aboutsummaryrefslogtreecommitdiffstats
path: root/module/os
diff options
context:
space:
mode:
Diffstat (limited to 'module/os')
-rw-r--r--module/os/freebsd/spl/acl_common.c1731
-rw-r--r--module/os/freebsd/spl/callb.c372
-rw-r--r--module/os/freebsd/spl/list.c245
-rw-r--r--module/os/freebsd/spl/sha224.h96
-rw-r--r--module/os/freebsd/spl/sha256.h99
-rw-r--r--module/os/freebsd/spl/sha256c.c378
-rw-r--r--module/os/freebsd/spl/sha384.h96
-rw-r--r--module/os/freebsd/spl/sha512.h101
-rw-r--r--module/os/freebsd/spl/sha512c.c508
-rw-r--r--module/os/freebsd/spl/sha512t.h143
-rw-r--r--module/os/freebsd/spl/spl_acl.c222
-rw-r--r--module/os/freebsd/spl/spl_atomic.c138
-rw-r--r--module/os/freebsd/spl/spl_cmn_err.c74
-rw-r--r--module/os/freebsd/spl/spl_dtrace.c37
-rw-r--r--module/os/freebsd/spl/spl_kmem.c351
-rw-r--r--module/os/freebsd/spl/spl_kstat.c321
-rw-r--r--module/os/freebsd/spl/spl_misc.c107
-rw-r--r--module/os/freebsd/spl/spl_policy.c429
-rw-r--r--module/os/freebsd/spl/spl_procfs_list.c79
-rw-r--r--module/os/freebsd/spl/spl_string.c106
-rw-r--r--module/os/freebsd/spl/spl_sunddi.c74
-rw-r--r--module/os/freebsd/spl/spl_sysevent.c259
-rw-r--r--module/os/freebsd/spl/spl_taskq.c329
-rw-r--r--module/os/freebsd/spl/spl_uio.c92
-rw-r--r--module/os/freebsd/spl/spl_vfs.c278
-rw-r--r--module/os/freebsd/spl/spl_vm.c71
-rw-r--r--module/os/freebsd/spl/spl_zlib.c268
-rw-r--r--module/os/freebsd/spl/spl_zone.c265
-rw-r--r--module/os/freebsd/zfs/abd.c1134
-rw-r--r--module/os/freebsd/zfs/arc_os.c245
-rw-r--r--module/os/freebsd/zfs/crypto_os.c613
-rw-r--r--module/os/freebsd/zfs/dmu_os.c346
-rw-r--r--module/os/freebsd/zfs/hkdf.c102
-rw-r--r--module/os/freebsd/zfs/kmod_core.c404
-rw-r--r--module/os/freebsd/zfs/spa_os.c280
-rw-r--r--module/os/freebsd/zfs/spa_stats.c114
-rw-r--r--module/os/freebsd/zfs/sysctl_os.c699
-rw-r--r--module/os/freebsd/zfs/vdev_file.c326
-rw-r--r--module/os/freebsd/zfs/vdev_geom.c1195
-rw-r--r--module/os/freebsd/zfs/vdev_label_os.c74
-rw-r--r--module/os/freebsd/zfs/zfs_acl.c2738
-rw-r--r--module/os/freebsd/zfs/zfs_ctldir.c1345
-rw-r--r--module/os/freebsd/zfs/zfs_debug.c254
-rw-r--r--module/os/freebsd/zfs/zfs_dir.c961
-rw-r--r--module/os/freebsd/zfs/zfs_file_os.c309
-rw-r--r--module/os/freebsd/zfs/zfs_fuid_os.c52
-rw-r--r--module/os/freebsd/zfs/zfs_ioctl_os.c194
-rw-r--r--module/os/freebsd/zfs/zfs_onexit_os.c70
-rw-r--r--module/os/freebsd/zfs/zfs_vfsops.c2448
-rw-r--r--module/os/freebsd/zfs/zfs_vnops.c6533
-rw-r--r--module/os/freebsd/zfs/zfs_znode.c1987
-rw-r--r--module/os/freebsd/zfs/zio_crypt.c1882
-rw-r--r--module/os/freebsd/zfs/zvol_os.c1476
53 files changed, 33050 insertions, 0 deletions
diff --git a/module/os/freebsd/spl/acl_common.c b/module/os/freebsd/spl/acl_common.c
new file mode 100644
index 000000000..8eea4695e
--- /dev/null
+++ b/module/os/freebsd/spl/acl_common.c
@@ -0,0 +1,1731 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
+#if defined(_KERNEL)
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
+#define ASSERT assert
+#endif
+
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+ uid_t key;
+ avl_node_t avl;
+ uint32_t mask;
+ uint32_t allowed;
+ uint32_t denied;
+ int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers;
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups;
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag;
+ ace_to_aent_state_t state;
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
+ * v = Ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be multiples of a word size)
+ * f = ptr to function to compare two objs
+ * returns (-1 = less than, 0 = equal, 1 = greater than
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)())
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (void *)(v + j * s);
+ p2 = (void *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+ *ptr = kmem_zalloc(size, KM_SLEEP);
+ return (0);
+}
+
+
+#if !defined(_KERNEL)
+acl_t *
+acl_alloc(enum acl_type type)
+{
+ acl_t *aclp;
+
+ if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+ return (NULL);
+
+ aclp->acl_aclp = NULL;
+ aclp->acl_cnt = 0;
+
+ switch (type) {
+ case ACE_T:
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ break;
+ case ACLENT_T:
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ break;
+ default:
+ acl_free(aclp);
+ aclp = NULL;
+ }
+ return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+ int acl_size;
+
+ if (aclp == NULL)
+ return;
+
+ if (aclp->acl_aclp) {
+ acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+ cacl_free(aclp->acl_aclp, acl_size);
+ }
+
+ cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given an mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
+{
+ uint32_t access = 0;
+ int haswriteperm = 0;
+ int hasreadperm = 0;
+
+ if (isallow) {
+ haswriteperm = (mode & S_IWOTH);
+ hasreadperm = (mode & S_IROTH);
+ } else {
+ haswriteperm = !(mode & S_IWOTH);
+ hasreadperm = !(mode & S_IROTH);
+ }
+
+ /*
+ * The following call takes care of correctly setting the following
+ * mask bits in the access_mask:
+ * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+ */
+ access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+ if (isallow) {
+ access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+ if (isowner)
+ access |= ACE_WRITE_ACL;
+ } else {
+ if (! isowner)
+ access |= ACE_WRITE_ACL;
+ }
+
+ /* read */
+ if (mode & S_IROTH) {
+ access |= ACE_READ_DATA;
+ }
+ /* write */
+ if (mode & S_IWOTH) {
+ access |= ACE_WRITE_DATA |
+ ACE_APPEND_DATA;
+ if (isdir)
+ access |= ACE_DELETE_CHILD;
+ }
+ /* exec */
+ if (mode & S_IXOTH) {
+ access |= ACE_EXECUTE;
+ }
+
+ return (access);
+}
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+ (void) memcpy(deny, allow, sizeof (ace_t));
+
+ deny->a_who = allow->a_who;
+
+ deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+ deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+ ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_WRITE_NAMED_ATTRS);
+ deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+ ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+ B_FALSE);
+}
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+ int *hasmask, mode_t *mask,
+ int *numuser, int *numgroup, int *needsort)
+{
+ int error = 0;
+ int i;
+ int curtype = 0;
+
+ *hasmask = 0;
+ *mask = 07;
+ *needsort = 0;
+ *numuser = 0;
+ *numgroup = 0;
+
+ for (i = 0; i < n; i++) {
+ if (aclent[i].a_type < curtype)
+ *needsort = 1;
+ else if (aclent[i].a_type > curtype)
+ curtype = aclent[i].a_type;
+ if (aclent[i].a_type & USER)
+ (*numuser)++;
+ if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+ (*numgroup)++;
+ if (aclent[i].a_type & CLASS_OBJ) {
+ if (*hasmask) {
+ error = EINVAL;
+ goto out;
+ } else {
+ *hasmask = 1;
+ *mask = aclent[i].a_perm;
+ }
+ }
+ }
+
+ if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+ int error = 0;
+ mode_t mask;
+ int numuser, numgroup, needsort;
+ int resultsize = 0;
+ int i, groupi = 0, skip;
+ ace_t *acep, *result = NULL;
+ int hasmask;
+
+ error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+ &numuser, &numgroup, &needsort);
+ if (error != 0)
+ goto out;
+
+ /* allow + deny for each aclent */
+ resultsize = n * 2;
+ if (hasmask) {
+ /*
+ * stick extra deny on the group_obj and on each
+ * user|group for the mask (the group_obj was added
+ * into the count for numgroup)
+ */
+ resultsize += numuser + numgroup;
+ /* ... and don't count the mask itself */
+ resultsize -= 2;
+ }
+
+ /* sort the source if necessary */
+ if (needsort)
+ ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+ if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
+ goto out;
+
+ acep = result;
+
+ for (i = 0; i < n; i++) {
+ /*
+ * don't process CLASS_OBJ (mask); mask was grabbed in
+ * ln_aent_preprocess()
+ */
+ if (aclent[i].a_type & CLASS_OBJ)
+ continue;
+
+ /* If we need an ACL_MASK emulator, prepend it now */
+ if ((hasmask) &&
+ (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+ acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ acep->a_flags = 0;
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |=
+ (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ }
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+ /*
+ * Set the access mask for the prepended deny
+ * ace. To do this, we invert the mask (found
+ * in ln_aent_preprocess()) then convert it to an
+ * DENY ace access_mask.
+ */
+ acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+ isdir, 0, 0);
+ acep += 1;
+ }
+
+ /* handle a_perm -> access_mask */
+ acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+ isdir, aclent[i].a_type & USER_OBJ, 1);
+
+ /* emulate a default aclent */
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+
+ /*
+ * handle a_perm and a_id
+ *
+ * this must be done last, since it involves the
+ * corresponding deny aces, which are handled
+ * differently for each different a_type.
+ */
+ if (aclent[i].a_type & USER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_OWNER;
+ ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+ acep += 2;
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_GROUP;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ }
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ /*
+ * Set the corresponding deny for the group ace.
+ *
+ * The deny aces go after all of the groups, unlike
+ * everything else, where they immediately follow
+ * the allow ace.
+ *
+ * We calculate "skip", the number of slots to
+ * skip ahead for the deny ace, here.
+ *
+ * The pattern is:
+ * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+ * thus, skip is
+ * (2 * numgroup) - 1 - groupi
+ * (2 * numgroup) to account for MD + A
+ * - 1 to account for the fact that we're on the
+ * access (A), not the mask (MD)
+ * - groupi to account for the fact that we have
+ * passed up groupi number of MD's.
+ */
+ skip = (2 * numgroup) - 1 - groupi;
+ ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+ /*
+ * If we just did the last group, skip acep past
+ * all of the denies; else, just move ahead one.
+ */
+ if (++groupi >= numgroup)
+ acep += numgroup + 1;
+ else
+ acep += 1;
+ } else if (aclent[i].a_type & OTHER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_EVERYONE;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ *acepp = result;
+ *rescount = resultsize;
+
+out:
+ if (error != 0) {
+ if ((result != NULL) && (resultsize > 0)) {
+ cacl_free(result, resultsize * sizeof (ace_t));
+ }
+ }
+
+ return (error);
+}
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
+ ace_t **retacep, int *retacecnt)
+{
+ ace_t *acep;
+ ace_t *dfacep;
+ int acecnt = 0;
+ int dfacecnt = 0;
+ int dfaclstart = 0;
+ int dfaclcnt = 0;
+ aclent_t *aclp;
+ int i;
+ int error;
+ int acesz, dfacesz;
+
+ ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+ for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ if (i < aclcnt) {
+ dfaclstart = i;
+ dfaclcnt = aclcnt - i;
+ }
+
+ if (dfaclcnt && !isdir) {
+ return (EINVAL);
+ }
+
+ error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+ if (error)
+ return (error);
+
+ if (dfaclcnt) {
+ error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+ &dfacep, &dfacecnt, isdir);
+ if (error) {
+ if (acep) {
+ cacl_free(acep, acecnt * sizeof (ace_t));
+ }
+ return (error);
+ }
+ }
+
+ if (dfacecnt != 0) {
+ acesz = sizeof (ace_t) * acecnt;
+ dfacesz = sizeof (ace_t) * dfacecnt;
+ acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+ if (acep == NULL)
+ return (ENOMEM);
+ if (dfaclcnt) {
+ (void) memcpy(acep + acecnt, dfacep, dfacesz);
+ }
+ }
+ if (dfaclcnt)
+ cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+ *retacecnt = acecnt + dfacecnt;
+ *retacep = acep;
+ return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ int error = 0;
+ o_mode_t mode = 0;
+ uint32_t bits, wantbits;
+
+ /* read */
+ if (mask & ACE_READ_DATA)
+ mode |= S_IROTH;
+
+ /* write */
+ wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+ if (isdir)
+ wantbits |= ACE_DELETE_CHILD;
+ bits = mask & wantbits;
+ if (bits != 0) {
+ if (bits != wantbits) {
+ error = ENOTSUP;
+ goto out;
+ }
+ mode |= S_IWOTH;
+ }
+
+ /* exec */
+ if (mask & ACE_EXECUTE) {
+ mode |= S_IXOTH;
+ }
+
+ *modep = mode;
+
+out:
+ return (error);
+}
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+ acevals_t key, *rc;
+ avl_index_t where;
+
+ key.key = ace->a_who;
+ rc = avl_find(avl, &key, &where);
+ if (rc != NULL)
+ return (rc);
+
+ /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+ if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+ return (NULL);
+
+ acevals_init(rc, ace->a_who);
+ avl_insert(avl, rc, where);
+ (*num)++;
+
+ return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+ int set_deny, err_deny;
+ int set_allow, err_allow;
+ int acl_consume;
+ int haswriteperm, hasreadperm;
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+ } else {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+ }
+
+ acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+ ACL_DELETE_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY |
+ ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+ ACL_WRITE_NAMED_WRITER_ERR_DENY |
+ ACL_READ_NAMED_READER_ERR_DENY);
+
+ if (mask_bit == ACE_SYNCHRONIZE) {
+ set_deny = ACL_SYNCHRONIZE_SET_DENY;
+ err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+ set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+ err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_OWNER) {
+ set_deny = ACL_WRITE_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+ } else if (mask_bit == ACE_DELETE) {
+ set_deny = ACL_DELETE_SET_DENY;
+ err_deny = ACL_DELETE_ERR_DENY;
+ set_allow = ACL_DELETE_SET_ALLOW;
+ err_allow = ACL_DELETE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+ if (isowner) {
+ set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+ } else if (haswriteperm) {
+ set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+ } else {
+ if ((acep->a_access_mask & mask_bit) &&
+ (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+ if (!hasreadperm)
+ return (0);
+
+ set_deny = ACL_READ_NAMED_READER_SET_DENY;
+ err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+ set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+ err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+ if (!haswriteperm)
+ return (0);
+
+ set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+ } else {
+ return (EINVAL);
+ }
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ if (acl_consume & set_deny) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_deny) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ } else {
+ /* ACE_ACCESS_ALLOWED_ACE_TYPE */
+ if (acl_consume & set_allow) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_allow) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+ int error = 0;
+ int isowner;
+
+ /* only ALLOW or DENY */
+ if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid flags */
+ if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* some flags are illegal */
+ if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+ ACE_FAILED_ACCESS_ACE_FLAG |
+ ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid masks */
+ if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ isowner = 1;
+ } else {
+ isowner = 0;
+ }
+
+ error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_DELETE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ /* more detailed checking of masks */
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+ (acep->a_access_mask & ACE_APPEND_DATA)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+ /* ACL enforcement */
+ if ((acep->a_access_mask & ACE_READ_ACL) &&
+ (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (acep->a_access_mask & ACE_WRITE_ACL) {
+ if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+ (isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (! isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+out:
+ return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+ (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+ return (ENOTSUP);
+ }
+
+ return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error;
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ if (vals->allowed != (vals->denied ^ flips)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+ (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ dest->a_type = vals->aent_type;
+ if (dest->a_type & (USER | GROUP)) {
+ dest->a_id = vals->key;
+ } else if (dest->a_type & USER_OBJ) {
+ dest->a_id = owner;
+ } else if (dest->a_type & GROUP_OBJ) {
+ dest->a_id = group;
+ } else if (dest->a_type & OTHER_OBJ) {
+ dest->a_id = 0;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error = 0;
+ aclent_t *aent, *result = NULL;
+ acevals_t *vals;
+ int resultcount;
+
+ if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+ (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ resultcount = 3 + list->numusers + list->numgroups;
+ /*
+ * This must be the same condition as below, when we add the CLASS_OBJ
+ * (aka ACL mask)
+ */
+ if ((list->hasmask) || (! list->dfacl_flag))
+ resultcount += 1;
+
+ if (cacl_malloc((void **)&result,
+ resultcount * sizeof (aclent_t)) != 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ aent = result;
+
+ /* USER_OBJ */
+ if (!(list->user_obj.aent_type & USER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+ isdir);
+
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* USER */
+ vals = NULL;
+ for (vals = avl_first(&list->user); vals != NULL;
+ vals = AVL_NEXT(&list->user, vals)) {
+ if (!(vals->aent_type & USER)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /* GROUP_OBJ */
+ if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* GROUP */
+ vals = NULL;
+ for (vals = avl_first(&list->group); vals != NULL;
+ vals = AVL_NEXT(&list->group, vals)) {
+ if (!(vals->aent_type & GROUP)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /*
+ * CLASS_OBJ (aka ACL_MASK)
+ *
+ * An ACL_MASK is not fabricated if the ACL is a default ACL.
+ * This is to follow UFS's behavior.
+ */
+ if ((list->hasmask) || (! list->dfacl_flag)) {
+ if (list->hasmask) {
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ error = ace_mask_to_mode(list->acl_mask ^ flips,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ } else {
+ /* fabricate the ACL_MASK from the group permissions */
+ error = ace_mask_to_mode(list->group_obj.allowed,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ }
+ aent->a_id = 0;
+ aent->a_type = CLASS_OBJ | list->dfacl_flag;
+ ++aent;
+ }
+ /* OTHER_OBJ */
+ if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+
+ *aclentp = result;
+ *aclcnt = resultcount;
+
+out:
+ if (error != 0) {
+ if (result != NULL)
+ cacl_free(result, resultcount * sizeof (aclent_t));
+ }
+
+ return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+ acevals_t *node;
+ void *cookie;
+
+ if (al == NULL)
+ return;
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+
+ avl_destroy(&al->user);
+ avl_destroy(&al->group);
+
+ /* free the container itself */
+ cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+ const acevals_t *a = va, *b = vb;
+
+ if (a->key == b->key)
+ return (0);
+
+ if (a->key > b->key)
+ return (1);
+
+ else
+ return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+ aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+ boolean_t isdir)
+{
+ int error = 0;
+ ace_t *acep;
+ uint32_t bits;
+ int i;
+ ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+ acevals_t *vals;
+
+ *aclentp = NULL;
+ *aclcnt = 0;
+ *dfaclentp = NULL;
+ *dfaclcnt = 0;
+
+ /* we need at least user_obj, group_obj, and other_obj */
+ if (n < 6) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (ace == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+
+ ace_list_init(normacl, 0);
+
+ error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ ace_list_init(dfacl, ACL_DEFAULT);
+
+ /* process every ace_t... */
+ for (i = 0; i < n; i++) {
+ acep = &ace[i];
+
+ /* rule out certain cases quickly */
+ error = ace_to_aent_legal(acep);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Turn off these bits in order to not have to worry about
+ * them when doing the checks for compliments.
+ */
+ acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+ ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+ ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+ /* see if this should be a regular or default acl */
+ bits = acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE);
+ if (bits != 0) {
+ /* all or nothing on these inherit bits */
+ if (bits != (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl = dfacl;
+ } else {
+ acl = normacl;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ if (acl->state > ace_user_obj) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user_obj;
+ acl->seen |= USER_OBJ;
+ vals = &acl->user_obj;
+ vals->aent_type = USER_OBJ | acl->dfacl_flag;
+ } else if ((acep->a_flags & ACE_EVERYONE)) {
+ acl->state = ace_other_obj;
+ acl->seen |= OTHER_OBJ;
+ vals = &acl->other_obj;
+ vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+ } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+ if (acl->state > ace_group) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_flags & ACE_GROUP)) {
+ acl->seen |= GROUP_OBJ;
+ vals = &acl->group_obj;
+ vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+ } else {
+ acl->seen |= GROUP;
+ vals = acevals_find(acep, &acl->group,
+ &acl->numgroups);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = GROUP | acl->dfacl_flag;
+ }
+ acl->state = ace_group;
+ } else {
+ if (acl->state > ace_user) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user;
+ acl->seen |= USER;
+ vals = acevals_find(acep, &acl->user,
+ &acl->numusers);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = USER | acl->dfacl_flag;
+ }
+
+ if (!(acl->state > ace_unused)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ /* no more than one allowed per aclent_t */
+ if (vals->allowed != ACE_MASK_UNDEFINED) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->allowed = acep->a_access_mask;
+ } else {
+ /*
+ * it's a DENY; if there was a previous DENY, it
+ * must have been an ACL_MASK.
+ */
+ if (vals->denied != ACE_MASK_UNDEFINED) {
+ /* ACL_MASK is for USER and GROUP only */
+ if ((acl->state != ace_user) &&
+ (acl->state != ace_group)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if (! acl->hasmask) {
+ acl->hasmask = 1;
+ acl->acl_mask = vals->denied;
+ /* check for mismatched ACL_MASK emulations */
+ } else if (acl->acl_mask != vals->denied) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->mask = vals->denied;
+ }
+ vals->denied = acep->a_access_mask;
+ }
+ }
+
+ /* done collating; produce the aclent_t lists */
+ if (normacl->state != ace_unused) {
+ error = ace_list_to_aent(normacl, aclentp, aclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+ if (dfacl->state != ace_unused) {
+ error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+out:
+ if (normacl != NULL)
+ ace_list_free(normacl);
+ if (dfacl != NULL)
+ ace_list_free(dfacl);
+
+ return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
+ uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+ int error = 0;
+ aclent_t *aclentp, *dfaclentp;
+ int aclcnt, dfaclcnt;
+ int aclsz, dfaclsz;
+
+ error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+ &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+ if (error)
+ return (error);
+
+
+ if (dfaclcnt != 0) {
+ /*
+ * Slap aclentp and dfaclentp into a single array.
+ */
+ aclsz = sizeof (aclent_t) * aclcnt;
+ dfaclsz = sizeof (aclent_t) * dfaclcnt;
+ aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+ if (aclentp != NULL) {
+ (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+ } else {
+ error = ENOMEM;
+ }
+ }
+
+ if (aclentp) {
+ *retaclentp = aclentp;
+ *retaclcnt = aclcnt + dfaclcnt;
+ }
+
+ if (dfaclentp)
+ cacl_free(dfaclentp, dfaclsz);
+
+ return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
+ gid_t group)
+{
+ int aclcnt;
+ void *acldata;
+ int error;
+
+ /*
+ * See if we need to translate
+ */
+ if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+ (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACLENT_T))
+ return (0);
+
+ if (target_flavor == -1) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (target_flavor == _ACL_ACE_ENABLED &&
+ aclp->acl_type == ACLENT_T) {
+ error = convert_aent_to_ace(aclp->acl_aclp,
+ aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+
+ } else if (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACE_T) {
+ error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+ isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+ } else {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * replace old acl with newly translated acl
+ */
+ cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+ aclp->acl_aclp = acldata;
+ aclp->acl_cnt = aclcnt;
+ if (target_flavor == _ACL_ACE_ENABLED) {
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ } else {
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ }
+ return (0);
+
+out:
+
+#if !defined(_KERNEL)
+ errno = error;
+ return (-1);
+#else
+ return (error);
+#endif
+}
+#endif /* !_KERNEL */
+
+#define SET_ACE(acl, index, who, mask, type, flags) { \
+ acl[0][index].a_who = (uint32_t)who; \
+ acl[0][index].a_type = type; \
+ acl[0][index].a_flags = flags; \
+ acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ (void) isdir; /* will need this later */
+
+ masks->deny1 = 0;
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+ int index = 0;
+ int error;
+ trivial_acl_t masks;
+
+ *count = 3;
+ acl_trivial_access_masks(mode, isdir, &masks);
+
+ if (masks.allow0)
+ (*count)++;
+ if (masks.deny1)
+ (*count)++;
+ if (masks.deny2)
+ (*count)++;
+
+ if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+ return (error);
+
+ if (masks.allow0) {
+ SET_ACE(acl, index, -1, masks.allow0,
+ ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny1) {
+ SET_ACE(acl, index, -1, masks.deny1,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny2) {
+ SET_ACE(acl, index, -1, masks.deny2,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+ }
+
+ SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_OWNER);
+ SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_EVERYONE);
+
+ return (0);
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Trivialness implies that the acl is composed of only
+ * owner, group, everyone entries. ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be owner@ entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+ * Don't allow anybody to deny reading basic
+ * attributes or a files ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+ * only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr/
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+ return (0);
+}
+
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+ uint16_t *type, uint32_t *mask)
+{
+ ace_t *acep = datap;
+
+ if (cookie >= aclcnt)
+ return (0);
+
+ *flags = acep[cookie].a_flags;
+ *type = acep[cookie].a_type;
+ *mask = acep[cookie++].a_access_mask;
+
+ return (cookie);
+}
+
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c
new file mode 100644
index 000000000..d4a0e141c
--- /dev/null
+++ b/module/os/freebsd/spl/callb.c
@@ -0,0 +1,372 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h> /* for delay() */
+#include <sys/taskq.h> /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define CB_MAXNAME TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+ struct callb *c_next; /* next in class or on freelist */
+ kthread_id_t c_thread; /* ptr to caller's thread struct */
+ char c_flag; /* info about the callb state */
+ uchar_t c_class; /* this callb's class */
+ kcondvar_t c_done_cv; /* signal callb completion */
+ boolean_t (*c_func)(); /* cb function: returns true if ok */
+ void *c_arg; /* arg to c_func */
+ char c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define CALLB_FREE 0x0
+#define CALLB_TAKEN 0x1
+#define CALLB_EXECUTING 0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into different class groups described
+ * by ct_class array.
+ * The callbs within a class are single-linked and normally run by a
+ * serial execution.
+ */
+typedef struct callb_table {
+ kmutex_t ct_lock; /* protect all callb states */
+ callb_t *ct_freelist; /* free callb structures */
+ int ct_busy; /* != 0 prevents additions */
+ kcondvar_t ct_busy_cv; /* to wait for not busy */
+ int ct_ncallb; /* num of callbs allocated */
+ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table; /* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
+callb_cpr_t callb_cprinfo_safe = {
+ &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} };
+
+/*
+ * Init all callb tables in the system.
+ */
+void
+callb_init(void *dummy __unused)
+{
+ callb_table.ct_busy = 0; /* mark table open for additions */
+ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+callb_fini(void *dummy __unused)
+{
+ callb_t *cp;
+ int i;
+
+ mutex_enter(&ct->ct_lock);
+ for (i = 0; i < 16; i++) {
+ while ((cp = ct->ct_freelist) != NULL) {
+ ct->ct_freelist = cp->c_next;
+ ct->ct_ncallb--;
+ kmem_free(cp, sizeof (callb_t));
+ }
+ if (ct->ct_ncallb == 0)
+ break;
+ /* Not all callbacks finished, waiting for the rest. */
+ mutex_exit(&ct->ct_lock);
+ tsleep(ct, 0, "callb", hz / 4);
+ mutex_enter(&ct->ct_lock);
+ }
+ if (ct->ct_ncallb > 0)
+ printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callout_add() is called to register func() be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callout_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callout_add().
+ * return -1 if fail to delete a callb entry otherwise return 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* DEBUG */
+
+ /*
+ * It is not allowed to delete a callb in the middle of
+ * executing otherwise, the callb_execute() will be confused.
+ */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class: indicates to execute all callbs in the same class;
+ * code: optional argument for the callb functions.
+ * return: = 0: success
+ * != 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+ callb_t *cp;
+ void *ret = NULL;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[class];
+ cp != NULL && ret == 0; cp = cp->c_next) {
+ while (cp->c_flag & CALLB_EXECUTING)
+ cv_wait(&cp->c_done_cv, &ct->ct_lock);
+ /*
+ * cont if the callb is deleted while we're sleeping
+ */
+ if (cp->c_flag == CALLB_FREE)
+ continue;
+ cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+ printf("callb_execute: name=%s func=%p arg=%p\n",
+ cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+ mutex_exit(&ct->ct_lock);
+ /* If callback function fails, pass back client's name */
+ if (!(*cp->c_func)(cp->c_arg, code))
+ ret = cp->c_name;
+ mutex_enter(&ct->ct_lock);
+
+ cp->c_flag &= ~CALLB_EXECUTING;
+ cv_broadcast(&cp->c_done_cv);
+ }
+ mutex_exit(&ct->ct_lock);
+ return (ret);
+}
+
+/*
+ * callers make sure no recursive entries to this func.
+ * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+ callb_cpr_t *cp = (callb_cpr_t *)arg;
+ clock_t ret = 0; /* assume success */
+
+ mutex_enter(cp->cc_lockp);
+
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+ while (!(cp->cc_events & CALLB_CPR_SAFE))
+ /* cv_timedwait() returns -1 if it times out. */
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
+ break;
+#endif
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ cp->cc_events &= ~CALLB_CPR_START;
+ cv_signal(&cp->cc_stop_cv);
+ break;
+ }
+ mutex_exit(cp->cc_lockp);
+ return (ret != -1);
+}
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+ return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy == 0);
+ ct->ct_busy = 1;
+ mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy != 0);
+ ct->ct_busy = 0;
+ cv_broadcast(&ct->ct_busy_cv);
+ mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/module/os/freebsd/spl/list.c b/module/os/freebsd/spl/list.c
new file mode 100644
index 000000000..e8db13a5c
--- /dev/null
+++ b/module/os/freebsd/spl/list.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define list_insert_after_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_prev = (node); \
+ lnew->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = lnew; \
+ (node)->list_next = lnew; \
+}
+
+#define list_insert_before_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_next = (node); \
+ lnew->list_prev = (node)->list_prev; \
+ (node)->list_prev->list_next = lnew; \
+ (node)->list_prev = lnew; \
+}
+
+#define list_remove_node(node) \
+ (node)->list_prev->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = (node)->list_prev; \
+ (node)->list_next = (node)->list_prev = NULL
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ ASSERT(list);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (list_node_t));
+
+ list->list_size = size;
+ list->list_offset = offset;
+ list->list_head.list_next = list->list_head.list_prev =
+ &list->list_head;
+}
+
+void
+list_destroy(list_t *list)
+{
+ list_node_t *node = &list->list_head;
+
+ ASSERT(list);
+ ASSERT(list->list_head.list_next == node);
+ ASSERT(list->list_head.list_prev == node);
+
+ node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_head(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_after_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_tail(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_before_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+ list_node_t *lold = list_d2l(list, object);
+ ASSERT(!list_empty(list));
+ ASSERT(lold->list_next != NULL);
+ list_remove_node(lold);
+}
+
+void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.list_next;
+ if (head == &list->list_head)
+ return (NULL);
+ list_remove_node(head);
+ return (list_object(list, head));
+}
+
+void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.list_prev;
+ if (tail == &list->list_head)
+ return (NULL);
+ list_remove_node(tail);
+ return (list_object(list, tail));
+}
+
+void *
+list_head(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_next != &list->list_head)
+ return (list_object(list, node->list_next));
+
+ return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_prev != &list->list_head)
+ return (list_object(list, node->list_prev));
+
+ return (NULL);
+}
+
+/*
+ * Insert src list after dst list. Empty src list thereafter.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+ list_node_t *dstnode = &dst->list_head;
+ list_node_t *srcnode = &src->list_head;
+
+ ASSERT(dst->list_size == src->list_size);
+ ASSERT(dst->list_offset == src->list_offset);
+
+ if (list_empty(src))
+ return;
+
+ dstnode->list_prev->list_next = srcnode->list_next;
+ srcnode->list_next->list_prev = dstnode->list_prev;
+ dstnode->list_prev = srcnode->list_prev;
+ srcnode->list_prev->list_next = dstnode;
+
+ /* empty src list */
+ srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+ ASSERT(list_link_active(lold));
+ ASSERT(!list_link_active(lnew));
+
+ lnew->list_next = lold->list_next;
+ lnew->list_prev = lold->list_prev;
+ lold->list_prev->list_next = lnew;
+ lold->list_next->list_prev = lnew;
+ lold->list_next = lold->list_prev = NULL;
+}
+
+void
+list_link_init(list_node_t *link)
+{
+ link->list_next = NULL;
+ link->list_prev = NULL;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+ return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+ return (list_empty(list));
+}
diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h
new file mode 100644
index 000000000..0abd43068
--- /dev/null
+++ b/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA224_H_
+#define _SHA224_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA224_BLOCK_LENGTH 64
+#define SHA224_DIGEST_LENGTH 28
+#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA224Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA224_BLOCK_LENGTH];
+} SHA224_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA224_Init
+#define SHA224_Init _libmd_SHA224_Init
+#endif
+#ifndef SHA224_Update
+#define SHA224_Update _libmd_SHA224_Update
+#endif
+#ifndef SHA224_Final
+#define SHA224_Final _libmd_SHA224_Final
+#endif
+#ifndef SHA224_End
+#define SHA224_End _libmd_SHA224_End
+#endif
+#ifndef SHA224_Fd
+#define SHA224_Fd _libmd_SHA224_Fd
+#endif
+#ifndef SHA224_FdChunk
+#define SHA224_FdChunk _libmd_SHA224_FdChunk
+#endif
+#ifndef SHA224_File
+#define SHA224_File _libmd_SHA224_File
+#endif
+#ifndef SHA224_FileChunk
+#define SHA224_FileChunk _libmd_SHA224_FileChunk
+#endif
+#ifndef SHA224_Data
+#define SHA224_Data _libmd_SHA224_Data
+#endif
+
+#ifndef SHA224_version
+#define SHA224_version _libmd_SHA224_version
+#endif
+
+void SHA224_Init(SHA224_CTX *);
+void SHA224_Update(SHA224_CTX *, const void *, size_t);
+void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)],
+ SHA224_CTX *);
+#ifndef _KERNEL
+char *SHA224_End(SHA224_CTX *, char *);
+char *SHA224_Data(const void *, unsigned int, char *);
+char *SHA224_Fd(int, char *);
+char *SHA224_FdChunk(int, char *, off_t, off_t);
+char *SHA224_File(const char *, char *);
+char *SHA224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA224_H_ */
diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h
new file mode 100644
index 000000000..193c0c025
--- /dev/null
+++ b/module/os/freebsd/spl/sha256.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA256_BLOCK_LENGTH 64
+#define SHA256_DIGEST_LENGTH 32
+#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA256Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA256_BLOCK_LENGTH];
+} SHA256_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA256_Init
+#define SHA256_Init _libmd_SHA256_Init
+#endif
+#ifndef SHA256_Update
+#define SHA256_Update _libmd_SHA256_Update
+#endif
+#ifndef SHA256_Final
+#define SHA256_Final _libmd_SHA256_Final
+#endif
+#ifndef SHA256_End
+#define SHA256_End _libmd_SHA256_End
+#endif
+#ifndef SHA256_Fd
+#define SHA256_Fd _libmd_SHA256_Fd
+#endif
+#ifndef SHA256_FdChunk
+#define SHA256_FdChunk _libmd_SHA256_FdChunk
+#endif
+#ifndef SHA256_File
+#define SHA256_File _libmd_SHA256_File
+#endif
+#ifndef SHA256_FileChunk
+#define SHA256_FileChunk _libmd_SHA256_FileChunk
+#endif
+#ifndef SHA256_Data
+#define SHA256_Data _libmd_SHA256_Data
+#endif
+
+#ifndef SHA256_Transform
+#define SHA256_Transform _libmd_SHA256_Transform
+#endif
+#ifndef SHA256_version
+#define SHA256_version _libmd_SHA256_version
+#endif
+
+void SHA256_Init(SHA256_CTX *);
+void SHA256_Update(SHA256_CTX *, const void *, size_t);
+void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)],
+ SHA256_CTX *);
+#ifndef _KERNEL
+char *SHA256_End(SHA256_CTX *, char *);
+char *SHA256_Data(const void *, unsigned int, char *);
+char *SHA256_Fd(int, char *);
+char *SHA256_FdChunk(int, char *, off_t, off_t);
+char *SHA256_File(const char *, char *);
+char *SHA256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA256_H_ */
diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c
new file mode 100644
index 000000000..241cf8c9a
--- /dev/null
+++ b/module/os/freebsd/spl/sha256c.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+
+#include <sys/byteorder.h>
+#include <sys/endian.h>
+#include "sha224.h"
+#include "sha256.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint32_t into a vector of bytes */
+#define be32enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint32_t */
+#define be32dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t). Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ dst[i] = be32dec(src + i * 4);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA256 round constants. */
+static const uint32_t K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+ uint32_t W[64];
+ uint32_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be32dec_vect(W, block, 64);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 32);
+
+ /* 3. Mix. */
+ for (i = 0; i < 64; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 48)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static unsigned char PAD[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Pad to 56 mod 64, transforming if we finish a block en route. */
+ if (r < 56) {
+ /* Pad to 56 mod 64. */
+ memcpy(&ctx->buf[r], PAD, 56 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 56);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc(&ctx->buf[56], ctx->count);
+
+ /* Mix in the final block. */
+ SHA256_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6A09E667;
+ ctx->state[1] = 0xBB67AE85;
+ ctx->state[2] = 0x3C6EF372;
+ ctx->state[3] = 0xA54FF53A;
+ ctx->state[4] = 0x510E527F;
+ ctx->state[5] = 0x9B05688C;
+ ctx->state[6] = 0x1F83D9AB;
+ ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen;
+ uint32_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Convert the length into a number of bits */
+ bitlen = len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ while (len >= 64) {
+ SHA256_Transform(ctx->state, src);
+ src += 64;
+ len -= 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad(ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization. Begins a SHA-224 operation. */
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xC1059ED8;
+ ctx->state[1] = 0x367CD507;
+ ctx->state[2] = 0x3070DD17;
+ ctx->state[3] = 0xF70E5939;
+ ctx->state[4] = 0xFFC00B31;
+ ctx->state[5] = 0x68581511;
+ ctx->state[6] = 0x64f98FA7;
+ ctx->state[7] = 0xBEFA4FA4;
+}
+
+/* Add bytes into the SHA-224 hash */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA256_Update((SHA256_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-224 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad((SHA256_CTX *)ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h
new file mode 100644
index 000000000..67250cee0
--- /dev/null
+++ b/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA384_H_
+#define _SHA384_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA384_BLOCK_LENGTH 128
+#define SHA384_DIGEST_LENGTH 48
+#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA384Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA384_BLOCK_LENGTH];
+} SHA384_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA384_Init
+#define SHA384_Init _libmd_SHA384_Init
+#endif
+#ifndef SHA384_Update
+#define SHA384_Update _libmd_SHA384_Update
+#endif
+#ifndef SHA384_Final
+#define SHA384_Final _libmd_SHA384_Final
+#endif
+#ifndef SHA384_End
+#define SHA384_End _libmd_SHA384_End
+#endif
+#ifndef SHA384_Fd
+#define SHA384_Fd _libmd_SHA384_Fd
+#endif
+#ifndef SHA384_FdChunk
+#define SHA384_FdChunk _libmd_SHA384_FdChunk
+#endif
+#ifndef SHA384_File
+#define SHA384_File _libmd_SHA384_File
+#endif
+#ifndef SHA384_FileChunk
+#define SHA384_FileChunk _libmd_SHA384_FileChunk
+#endif
+#ifndef SHA384_Data
+#define SHA384_Data _libmd_SHA384_Data
+#endif
+
+#ifndef SHA384_version
+#define SHA384_version _libmd_SHA384_version
+#endif
+
+void SHA384_Init(SHA384_CTX *);
+void SHA384_Update(SHA384_CTX *, const void *, size_t);
+void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)],
+ SHA384_CTX *);
+#ifndef _KERNEL
+char *SHA384_End(SHA384_CTX *, char *);
+char *SHA384_Data(const void *, unsigned int, char *);
+char *SHA384_Fd(int, char *);
+char *SHA384_FdChunk(int, char *, off_t, off_t);
+char *SHA384_File(const char *, char *);
+char *SHA384_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA384_H_ */
diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h
new file mode 100644
index 000000000..b6fb733ca
--- /dev/null
+++ b/module/os/freebsd/spl/sha512.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512_H_
+#define _SHA512_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_BLOCK_LENGTH 128
+#define SHA512_DIGEST_LENGTH 64
+#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA512Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA512_BLOCK_LENGTH];
+} SHA512_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#if 0
+#ifndef SHA512_Init
+#define SHA512_Init _libmd_SHA512_Init
+#endif
+#ifndef SHA512_Update
+#define SHA512_Update _libmd_SHA512_Update
+#endif
+#ifndef SHA512_Final
+#define SHA512_Final _libmd_SHA512_Final
+#endif
+#endif
+#ifndef SHA512_End
+#define SHA512_End _libmd_SHA512_End
+#endif
+#ifndef SHA512_Fd
+#define SHA512_Fd _libmd_SHA512_Fd
+#endif
+#ifndef SHA512_FdChunk
+#define SHA512_FdChunk _libmd_SHA512_FdChunk
+#endif
+#ifndef SHA512_File
+#define SHA512_File _libmd_SHA512_File
+#endif
+#ifndef SHA512_FileChunk
+#define SHA512_FileChunk _libmd_SHA512_FileChunk
+#endif
+#ifndef SHA512_Data
+#define SHA512_Data _libmd_SHA512_Data
+#endif
+
+#ifndef SHA512_Transform
+#define SHA512_Transform _libmd_SHA512_Transform
+#endif
+#ifndef SHA512_version
+#define SHA512_version _libmd_SHA512_version
+#endif
+
+void SHA512_Init(SHA512_CTX *);
+void SHA512_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_End(SHA512_CTX *, char *);
+char *SHA512_Data(const void *, unsigned int, char *);
+char *SHA512_Fd(int, char *);
+char *SHA512_FdChunk(int, char *, off_t, off_t);
+char *SHA512_File(const char *, char *);
+char *SHA512_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512_H_ */
diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c
new file mode 100644
index 000000000..146f338f0
--- /dev/null
+++ b/module/os/freebsd/spl/sha512c.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright 2005 Colin Percival
+ * Copyright (c) 2015 Allan Jude <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint64_t into a vector of bytes */
+#define be64enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define be64dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint64_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 8.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ be64enc(dst + i * 8, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint64_t). Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants. */
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+/* Elementary functions used by SHA512 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (64 - n)))
+#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39))
+#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41))
+#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
+#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6))
+
+/* SHA512 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(80 - i) % 8], S[(81 - i) % 8], \
+ S[(82 - i) % 8], S[(83 - i) % 8], \
+ S[(84 - i) % 8], S[(85 - i) % 8], \
+ S[(86 - i) % 8], S[(87 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA512 block compression function. The 512-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA512_Transform(uint64_t *state,
+ const unsigned char block[SHA512_BLOCK_LENGTH])
+{
+ uint64_t W[80];
+ uint64_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be64dec_vect(W, block, SHA512_BLOCK_LENGTH);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, SHA512_DIGEST_LENGTH);
+
+ /* 3. Mix. */
+ for (i = 0; i < 80; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 64)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static unsigned char PAD[SHA512_BLOCK_LENGTH] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA512_Pad(SHA512_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Pad to 112 mod 128, transforming if we finish a block en route. */
+ if (r < 112) {
+ /* Pad to 112 mod 128. */
+ memcpy(&ctx->buf[r], PAD, 112 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 128 - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 112);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc_vect(&ctx->buf[112], ctx->count, 16);
+
+ /* Mix in the final block. */
+ SHA512_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-512 initialization. Begins a SHA-512 operation. */
+void
+SHA512_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6a09e667f3bcc908ULL;
+ ctx->state[1] = 0xbb67ae8584caa73bULL;
+ ctx->state[2] = 0x3c6ef372fe94f82bULL;
+ ctx->state[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->state[4] = 0x510e527fade682d1ULL;
+ ctx->state[5] = 0x9b05688c2b3e6c1fULL;
+ ctx->state[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->state[7] = 0x5be0cd19137e2179ULL;
+}
+
+/* Add bytes into the hash */
+void
+SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen[2];
+ uint64_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Convert the length into a number of bits */
+ bitlen[1] = ((uint64_t)len) << 3;
+ bitlen[0] = ((uint64_t)len) >> 61;
+
+ /* Update number of bits */
+ if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+ ctx->count[0]++;
+ ctx->count[0] += bitlen[0];
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < SHA512_BLOCK_LENGTH - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+ src += SHA512_BLOCK_LENGTH - r;
+ len -= SHA512_BLOCK_LENGTH - r;
+
+ /* Perform complete blocks */
+ while (len >= SHA512_BLOCK_LENGTH) {
+ SHA512_Transform(ctx->state, src);
+ src += SHA512_BLOCK_LENGTH;
+ len -= SHA512_BLOCK_LENGTH;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-512 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-512t: ******************************************************** */
+/*
+ * the SHA512t transforms are identical to SHA512 so reuse the existing function
+ */
+void
+SHA512_224_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x8c3d37c819544da2ULL;
+ ctx->state[1] = 0x73e1996689dcd4d6ULL;
+ ctx->state[2] = 0x1dfab7ae32ff9c82ULL;
+ ctx->state[3] = 0x679dd514582f9fcfULL;
+ ctx->state[4] = 0x0f6d2b697bd44da8ULL;
+ ctx->state[5] = 0x77e36f7304c48942ULL;
+ ctx->state[6] = 0x3f9d85a86a1d36c8ULL;
+ ctx->state[7] = 0x1112e6ad91d692a1ULL;
+}
+
+void
+SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH],
+ SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+void
+SHA512_256_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x22312194fc2bf72cULL;
+ ctx->state[1] = 0x9f555fa3c84c64c2ULL;
+ ctx->state[2] = 0x2393b86b6f53b151ULL;
+ ctx->state[3] = 0x963877195940eabdULL;
+ ctx->state[4] = 0x96283ee2a88effe3ULL;
+ ctx->state[5] = 0xbe5e1e2553863992ULL;
+ ctx->state[6] = 0x2b0199fc2c85b8aaULL;
+ ctx->state[7] = 0x0eb72ddc81c52ca2ULL;
+}
+
+void
+SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH],
+ SHA512_CTX * ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* ** SHA-384: ******************************************************** */
+/*
+ * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped
+ */
+
+/* SHA-384 initialization. Begins a SHA-384 operation. */
+void
+SHA384_Init(SHA384_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->state[1] = 0x629a292a367cd507ULL;
+ ctx->state[2] = 0x9159015a3070dd17ULL;
+ ctx->state[3] = 0x152fecd8f70e5939ULL;
+ ctx->state[4] = 0x67332667ffc00b31ULL;
+ ctx->state[5] = 0x8eb44a8768581511ULL;
+ ctx->state[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->state[7] = 0x47b5481dbefa4fa4ULL;
+}
+
+/* Add bytes into the SHA-384 hash */
+void
+SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update((SHA512_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-384 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad((SHA512_CTX *)ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#if 0
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA512_Init
+__weak_reference(_libmd_SHA512_Init, SHA512_Init);
+#undef SHA512_Update
+__weak_reference(_libmd_SHA512_Update, SHA512_Update);
+#undef SHA512_Final
+__weak_reference(_libmd_SHA512_Final, SHA512_Final);
+#undef SHA512_Transform
+__weak_reference(_libmd_SHA512_Transform, SHA512_Transform);
+
+#undef SHA512_224_Init
+__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init);
+#undef SHA512_224_Update
+__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update);
+#undef SHA512_224_Final
+__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final);
+
+#undef SHA512_256_Init
+__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init);
+#undef SHA512_256_Update
+__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update);
+#undef SHA512_256_Final
+__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final);
+
+#undef SHA384_Init
+__weak_reference(_libmd_SHA384_Init, SHA384_Init);
+#undef SHA384_Update
+__weak_reference(_libmd_SHA384_Update, SHA384_Update);
+#undef SHA384_Final
+__weak_reference(_libmd_SHA384_Final, SHA384_Final);
+#endif
diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h
new file mode 100644
index 000000000..703867fc0
--- /dev/null
+++ b/module/os/freebsd/spl/sha512t.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Allan Jude <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512T_H_
+#define _SHA512T_H_
+
+#include "sha512.h"
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_224_DIGEST_LENGTH 28
+#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1)
+#define SHA512_256_DIGEST_LENGTH 32
+#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1)
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA512_224_Init
+#define SHA512_224_Init _libmd_SHA512_224_Init
+#endif
+#ifndef SHA512_224_Update
+#define SHA512_224_Update _libmd_SHA512_224_Update
+#endif
+#ifndef SHA512_224_Final
+#define SHA512_224_Final _libmd_SHA512_224_Final
+#endif
+#ifndef SHA512_224_End
+#define SHA512_224_End _libmd_SHA512_224_End
+#endif
+#ifndef SHA512_224_Fd
+#define SHA512_224_Fd _libmd_SHA512_224_Fd
+#endif
+#ifndef SHA512_224_FdChunk
+#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk
+#endif
+#ifndef SHA512_224_File
+#define SHA512_224_File _libmd_SHA512_224_File
+#endif
+#ifndef SHA512_224_FileChunk
+#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk
+#endif
+#ifndef SHA512_224_Data
+#define SHA512_224_Data _libmd_SHA512_224_Data
+#endif
+
+#ifndef SHA512_224_Transform
+#define SHA512_224_Transform _libmd_SHA512_224_Transform
+#endif
+#ifndef SHA512_224_version
+#define SHA512_224_version _libmd_SHA512_224_version
+#endif
+
+#ifndef SHA512_256_Init
+#define SHA512_256_Init _libmd_SHA512_256_Init
+#endif
+#ifndef SHA512_256_Update
+#define SHA512_256_Update _libmd_SHA512_256_Update
+#endif
+#ifndef SHA512_256_Final
+#define SHA512_256_Final _libmd_SHA512_256_Final
+#endif
+#ifndef SHA512_256_End
+#define SHA512_256_End _libmd_SHA512_256_End
+#endif
+#ifndef SHA512_256_Fd
+#define SHA512_256_Fd _libmd_SHA512_256_Fd
+#endif
+#ifndef SHA512_256_FdChunk
+#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk
+#endif
+#ifndef SHA512_256_File
+#define SHA512_256_File _libmd_SHA512_256_File
+#endif
+#ifndef SHA512_256_FileChunk
+#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk
+#endif
+#ifndef SHA512_256_Data
+#define SHA512_256_Data _libmd_SHA512_256_Data
+#endif
+
+#ifndef SHA512_256_Transform
+#define SHA512_256_Transform _libmd_SHA512_256_Transform
+#endif
+#ifndef SHA512_256_version
+#define SHA512_256_version _libmd_SHA512_256_version
+#endif
+
+void SHA512_224_Init(SHA512_CTX *);
+void SHA512_224_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_224_End(SHA512_CTX *, char *);
+char *SHA512_224_Data(const void *, unsigned int, char *);
+char *SHA512_224_Fd(int, char *);
+char *SHA512_224_FdChunk(int, char *, off_t, off_t);
+char *SHA512_224_File(const char *, char *);
+char *SHA512_224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+void SHA512_256_Init(SHA512_CTX *);
+void SHA512_256_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_256_End(SHA512_CTX *, char *);
+char *SHA512_256_Data(const void *, unsigned int, char *);
+char *SHA512_256_Fd(int, char *);
+char *SHA512_256_FdChunk(int, char *, off_t, off_t);
+char *SHA512_256_File(const char *, char *);
+char *SHA512_256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512T_H_ */
diff --git a/module/os/freebsd/spl/spl_acl.c b/module/os/freebsd/spl/spl_acl.c
new file mode 100644
index 000000000..bb4c30728
--- /dev/null
+++ b/module/os/freebsd/spl/spl_acl.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008, 2009 Edward Tomasz NapieraÅ‚a <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/errno.h>
+#include <sys/zfs_acl.h>
+#include <sys/acl.h>
+
+struct zfs2bsd {
+ uint32_t zb_zfs;
+ int zb_bsd;
+};
+
+struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA},
+ {ACE_WRITE_DATA, ACL_WRITE_DATA},
+ {ACE_EXECUTE, ACL_EXECUTE},
+ {ACE_APPEND_DATA, ACL_APPEND_DATA},
+ {ACE_DELETE_CHILD, ACL_DELETE_CHILD},
+ {ACE_DELETE, ACL_DELETE},
+ {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+ {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+ {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+ {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+ {ACE_READ_ACL, ACL_READ_ACL},
+ {ACE_WRITE_ACL, ACL_WRITE_ACL},
+ {ACE_WRITE_OWNER, ACL_WRITE_OWNER},
+ {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE},
+ {0, 0}};
+
+struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE,
+ ACL_ENTRY_FILE_INHERIT},
+ {ACE_DIRECTORY_INHERIT_ACE,
+ ACL_ENTRY_DIRECTORY_INHERIT},
+ {ACE_NO_PROPAGATE_INHERIT_ACE,
+ ACL_ENTRY_NO_PROPAGATE_INHERIT},
+ {ACE_INHERIT_ONLY_ACE,
+ ACL_ENTRY_INHERIT_ONLY},
+ {ACE_INHERITED_ACE,
+ ACL_ENTRY_INHERITED},
+ {ACE_SUCCESSFUL_ACCESS_ACE_FLAG,
+ ACL_ENTRY_SUCCESSFUL_ACCESS},
+ {ACE_FAILED_ACCESS_ACE_FLAG,
+ ACL_ENTRY_FAILED_ACCESS},
+ {0, 0}};
+
+static int
+_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ int bsd = 0;
+
+ for (tmp = table; tmp->zb_zfs != 0; tmp++) {
+ if (zfs & tmp->zb_zfs)
+ bsd |= tmp->zb_bsd;
+ }
+
+ return (bsd);
+}
+
+static uint32_t
+_zfs_from_bsd(int bsd, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ uint32_t zfs = 0;
+
+ for (tmp = table; tmp->zb_bsd != 0; tmp++) {
+ if (bsd & tmp->zb_bsd)
+ zfs |= tmp->zb_zfs;
+ }
+
+ return (zfs);
+}
+
+int
+acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries)
+{
+ int i;
+ struct acl_entry *entry;
+ const ace_t *ace;
+
+ if (nentries < 1) {
+ printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ if (nentries > ACL_MAX_ENTRIES) {
+ /*
+ * I believe it may happen only when moving a pool
+ * from SunOS to FreeBSD.
+ */
+ printf("acl_from_aces: ZFS ACL too big to fit "
+ "into 'struct acl'; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ bzero(aclp, sizeof (*aclp));
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+ aclp->acl_cnt = nentries;
+
+ for (i = 0; i < nentries; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ if (ace->a_flags & ACE_OWNER)
+ entry->ae_tag = ACL_USER_OBJ;
+ else if (ace->a_flags & ACE_GROUP)
+ entry->ae_tag = ACL_GROUP_OBJ;
+ else if (ace->a_flags & ACE_EVERYONE)
+ entry->ae_tag = ACL_EVERYONE;
+ else if (ace->a_flags & ACE_IDENTIFIER_GROUP)
+ entry->ae_tag = ACL_GROUP;
+ else
+ entry->ae_tag = ACL_USER;
+
+ if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP)
+ entry->ae_id = ace->a_who;
+ else
+ entry->ae_id = ACL_UNDEFINED_ID;
+
+ entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms);
+ entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags);
+
+ switch (ace->a_type) {
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW;
+ break;
+ case ACE_ACCESS_DENIED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+ break;
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT;
+ break;
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM;
+ break;
+ default:
+ panic("acl_from_aces: a_type is 0x%x", ace->a_type);
+ }
+ }
+
+ return (0);
+}
+
+void
+aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp)
+{
+ int i;
+ const struct acl_entry *entry;
+ ace_t *ace;
+
+ bzero(aces, sizeof (*aces) * aclp->acl_cnt);
+
+ *nentries = aclp->acl_cnt;
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ ace->a_who = entry->ae_id;
+
+ if (entry->ae_tag == ACL_USER_OBJ)
+ ace->a_flags = ACE_OWNER;
+ else if (entry->ae_tag == ACL_GROUP_OBJ)
+ ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+ else if (entry->ae_tag == ACL_GROUP)
+ ace->a_flags = ACE_IDENTIFIER_GROUP;
+ else if (entry->ae_tag == ACL_EVERYONE)
+ ace->a_flags = ACE_EVERYONE;
+ else /* ACL_USER */
+ ace->a_flags = 0;
+
+ ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms);
+ ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags);
+
+ switch (entry->ae_entry_type) {
+ case ACL_ENTRY_TYPE_ALLOW:
+ ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_DENY:
+ ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_ALARM:
+ ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_AUDIT:
+ ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE;
+ break;
+ default:
+ panic("aces_from_acl: ae_entry_type is 0x%x",
+ entry->ae_entry_type);
+ }
+ }
+}
diff --git a/module/os/freebsd/spl/spl_atomic.c b/module/os/freebsd/spl/spl_atomic.c
new file mode 100644
index 000000000..e82fed847
--- /dev/null
+++ b/module/os/freebsd/spl/spl_atomic.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/atomic.h>
+
+#ifdef _KERNEL
+#include <sys/kernel.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+#else
+#include <pthread.h>
+
+#define mtx_lock(lock) pthread_mutex_lock(lock)
+#define mtx_unlock(lock) pthread_mutex_unlock(lock)
+
+static pthread_mutex_t atomic_mtx;
+
+static __attribute__((constructor)) void
+atomic_init(void)
+{
+ pthread_mutex_init(&atomic_mtx, NULL);
+}
+#endif
+
+#if !defined(__LP64__) && !defined(__mips_n32) && \
+ !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64)
+void
+atomic_add_64(volatile uint64_t *target, int64_t delta)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target += delta;
+ mtx_unlock(&atomic_mtx);
+}
+
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target -= 1;
+ mtx_unlock(&atomic_mtx);
+}
+#endif
+
+uint64_t
+atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
+{
+ uint64_t newval;
+
+ mtx_lock(&atomic_mtx);
+ newval = (*target += delta);
+ mtx_unlock(&atomic_mtx);
+ return (newval);
+}
+
+#if defined(__powerpc__) || defined(__arm__) || defined(__mips__)
+void
+atomic_or_8(volatile uint8_t *target, uint8_t value)
+{
+ mtx_lock(&atomic_mtx);
+ *target |= value;
+ mtx_unlock(&atomic_mtx);
+}
+#endif
+
+uint8_t
+atomic_or_8_nv(volatile uint8_t *target, uint8_t value)
+{
+ uint8_t newval;
+
+ mtx_lock(&atomic_mtx);
+ newval = (*target |= value);
+ mtx_unlock(&atomic_mtx);
+ return (newval);
+}
+
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+{
+ uint64_t oldval;
+
+ mtx_lock(&atomic_mtx);
+ oldval = *target;
+ if (oldval == cmp)
+ *target = newval;
+ mtx_unlock(&atomic_mtx);
+ return (oldval);
+}
+
+uint32_t
+atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval)
+{
+ uint32_t oldval;
+
+ mtx_lock(&atomic_mtx);
+ oldval = *target;
+ if (oldval == cmp)
+ *target = newval;
+ mtx_unlock(&atomic_mtx);
+ return (oldval);
+}
+
+void
+membar_producer(void)
+{
+ /* nothing */
+}
diff --git a/module/os/freebsd/spl/spl_cmn_err.c b/module/os/freebsd/spl/spl_cmn_err.c
new file mode 100644
index 000000000..23566603f
--- /dev/null
+++ b/module/os/freebsd/spl/spl_cmn_err.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 John Birrell <[email protected]>. All rights reserved.
+ * Copyright 2012 Martin Matuska <[email protected]>. All rights reserved.
+ */
+
+#include <sys/cmn_err.h>
+
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+ char buf[256];
+ const char *prefix;
+
+ prefix = NULL; /* silence unwitty compilers */
+ switch (ce) {
+ case CE_CONT:
+ prefix = "Solaris(cont): ";
+ break;
+ case CE_NOTE:
+ prefix = "Solaris: NOTICE: ";
+ break;
+ case CE_WARN:
+ prefix = "Solaris: WARNING: ";
+ break;
+ case CE_PANIC:
+ prefix = "Solaris(panic): ";
+ break;
+ case CE_IGNORE:
+ break;
+ default:
+ panic("Solaris: unknown severity level");
+ }
+ if (ce == CE_PANIC) {
+ vsnprintf(buf, sizeof (buf), fmt, adx);
+ panic("%s%s", prefix, buf);
+ }
+ if (ce != CE_IGNORE) {
+ printf("%s", prefix);
+ vprintf(fmt, adx);
+ printf("\n");
+ }
+}
+
+void
+cmn_err(int type, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(type, fmt, ap);
+ va_end(ap);
+}
diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c
new file mode 100644
index 000000000..e7b2ff823
--- /dev/null
+++ b/module/os/freebsd/spl/spl_dtrace.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2014 The FreeBSD Project.
+ * All rights reserved.
+ *
+ * This software was developed by Steven Hartland.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/sdt.h>
+
+/* CSTYLED */
+SDT_PROBE_DEFINE1(sdt, , , set__error, "int");
diff --git a/module/os/freebsd/spl/spl_kmem.c b/module/os/freebsd/spl/spl_kmem.c
new file mode 100644
index 000000000..af3747c27
--- /dev/null
+++ b/module/os/freebsd/spl/spl_kmem.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+
+#ifdef KMEM_DEBUG
+#include <sys/queue.h>
+#include <sys/stack.h>
+#endif
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+#else
+#define malloc(size, type, flags) malloc(size)
+#define free(addr, type) free(addr)
+#endif
+
+#ifdef KMEM_DEBUG
+struct kmem_item {
+ struct stack stack;
+ LIST_ENTRY(kmem_item) next;
+};
+static LIST_HEAD(, kmem_item) kmem_items;
+static struct mtx kmem_items_mtx;
+MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
+#endif /* KMEM_DEBUG */
+
+#include <sys/vmem.h>
+
+void *
+zfs_kmem_alloc(size_t size, int kmflags)
+{
+ void *p;
+#ifdef KMEM_DEBUG
+ struct kmem_item *i;
+
+ size += sizeof (struct kmem_item);
+#endif
+ p = malloc(MAX(size, 16), M_SOLARIS, kmflags);
+#ifndef _KERNEL
+ if (kmflags & KM_SLEEP)
+ assert(p != NULL);
+#endif
+#ifdef KMEM_DEBUG
+ if (p != NULL) {
+ i = p;
+ p = (uint8_t *)p + sizeof (struct kmem_item);
+ stack_save(&i->stack);
+ mtx_lock(&kmem_items_mtx);
+ LIST_INSERT_HEAD(&kmem_items, i, next);
+ mtx_unlock(&kmem_items_mtx);
+ }
+#endif
+ return (p);
+}
+
+void
+zfs_kmem_free(void *buf, size_t size __unused)
+{
+#ifdef KMEM_DEBUG
+ if (buf == NULL) {
+ printf("%s: attempt to free NULL\n", __func__);
+ return;
+ }
+ struct kmem_item *i;
+
+ buf = (uint8_t *)buf - sizeof (struct kmem_item);
+ mtx_lock(&kmem_items_mtx);
+ LIST_FOREACH(i, &kmem_items, next) {
+ if (i == buf)
+ break;
+ }
+ ASSERT(i != NULL);
+ LIST_REMOVE(i, next);
+ mtx_unlock(&kmem_items_mtx);
+ memset(buf, 0xDC, MAX(size, 16));
+#endif
+ free(buf, M_SOLARIS);
+}
+
+static uint64_t kmem_size_val;
+
+static void
+kmem_size_init(void *unused __unused)
+{
+
+ kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
+ if (kmem_size_val > vm_kmem_size)
+ kmem_size_val = vm_kmem_size;
+}
+SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
+
+uint64_t
+kmem_size(void)
+{
+
+ return (kmem_size_val);
+}
+
+static int
+kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
+{
+ struct kmem_cache *cache = private;
+
+ return (cache->kc_constructor(mem, cache->kc_private, flags));
+}
+
+static void
+kmem_std_destructor(void *mem, int size __unused, void *private)
+{
+ struct kmem_cache *cache = private;
+
+ cache->kc_destructor(mem, cache->kc_private);
+}
+
+kmem_cache_t *
+kmem_cache_create(char *name, size_t bufsize, size_t align,
+ int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
+ void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
+{
+ kmem_cache_t *cache;
+
+ ASSERT(vmp == NULL);
+
+ cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
+ strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
+ cache->kc_constructor = constructor;
+ cache->kc_destructor = destructor;
+ cache->kc_private = private;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
+ constructor != NULL ? kmem_std_constructor : NULL,
+ destructor != NULL ? kmem_std_destructor : NULL,
+ NULL, NULL, align > 0 ? align - 1 : 0, cflags);
+#else
+ cache->kc_size = bufsize;
+#endif
+
+ return (cache);
+}
+
+void
+kmem_cache_destroy(kmem_cache_t *cache)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zdestroy(cache->kc_zone);
+#endif
+ kmem_free(cache, sizeof (*cache));
+}
+
+void *
+kmem_cache_alloc(kmem_cache_t *cache, int flags)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ return (uma_zalloc_arg(cache->kc_zone, cache, flags));
+#else
+ void *p;
+
+ p = kmem_alloc(cache->kc_size, flags);
+ if (p != NULL && cache->kc_constructor != NULL)
+ kmem_std_constructor(p, cache->kc_size, cache, flags);
+ return (p);
+#endif
+}
+
+void
+kmem_cache_free(kmem_cache_t *cache, void *buf)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zfree_arg(cache->kc_zone, buf, cache);
+#else
+ if (cache->kc_destructor != NULL)
+ kmem_std_destructor(buf, cache->kc_size, cache);
+ kmem_free(buf, cache->kc_size);
+#endif
+}
+
+/*
+ * Allow our caller to determine if there are running reaps.
+ *
+ * This call is very conservative and may return B_TRUE even when
+ * reaping activity isn't active. If it returns B_FALSE, then reaping
+ * activity is definitely inactive.
+ */
+boolean_t
+kmem_cache_reap_active(void)
+{
+
+ return (B_FALSE);
+}
+
+/*
+ * Reap (almost) everything soon.
+ *
+ * Note: this does not wait for the reap-tasks to complete. Caller
+ * should use kmem_cache_reap_active() (above) and/or moderation to
+ * avoid scheduling too many reap-tasks.
+ */
+#ifdef _KERNEL
+void
+kmem_cache_reap_soon(kmem_cache_t *cache)
+{
+#ifndef KMEM_DEBUG
+#if __FreeBSD_version >= 1300043
+ uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
+#else
+ zone_drain(cache->kc_zone);
+#endif
+#endif
+}
+
+void
+kmem_reap(void)
+{
+#if __FreeBSD_version >= 1300043
+ uma_reclaim(UMA_RECLAIM_TRIM);
+#else
+ uma_reclaim();
+#endif
+}
+#else
+void
+kmem_cache_reap_soon(kmem_cache_t *cache __unused)
+{
+}
+
+void
+kmem_reap(void)
+{
+}
+#endif
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+
+void *
+calloc(size_t n, size_t s)
+{
+ return (kmem_zalloc(n * s, KM_NOSLEEP));
+}
+
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+ char *msg;
+ va_list adx2;
+
+ va_copy(adx2, adx);
+ msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+ (void) vsprintf(msg, fmt, adx2);
+ va_end(adx2);
+
+ return (msg);
+}
+
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#ifdef KMEM_DEBUG
+#error "KMEM_DEBUG not currently supported"
+#endif
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+ return (uma_zone_get_cur(cache->kc_zone));
+}
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+ return (cache->kc_zone->uz_size);
+}
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+
+#ifdef KMEM_DEBUG
+void kmem_show(void *);
+void
+kmem_show(void *dummy __unused)
+{
+ struct kmem_item *i;
+
+ mtx_lock(&kmem_items_mtx);
+ if (LIST_EMPTY(&kmem_items))
+ printf("KMEM_DEBUG: No leaked elements.\n");
+ else {
+ printf("KMEM_DEBUG: Leaked elements:\n\n");
+ LIST_FOREACH(i, &kmem_items, next) {
+ printf("address=%p\n", i);
+ stack_print_ddb(&i->stack);
+ printf("\n");
+ }
+ }
+ mtx_unlock(&kmem_items_mtx);
+}
+
+SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
+#endif /* KMEM_DEBUG */
diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c
new file mode 100644
index 000000000..fda03a3d7
--- /dev/null
+++ b/module/os/freebsd/spl/spl_kstat.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/kstat.h>
+
+static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
+
+SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+kstat_t *
+__kstat_create(const char *module, int instance, const char *name,
+ const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
+{
+ struct sysctl_oid *root;
+ kstat_t *ksp;
+
+ KASSERT(instance == 0, ("instance=%d", instance));
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ /*
+ * Allocate the main structure. We don't need to copy module/class/name
+ * stuff in here, because it is only used for sysctl node creation
+ * done in this function.
+ */
+ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = instance;
+ strncpy(ksp->ks_name, name, KSTAT_STRLEN);
+ strncpy(ksp->ks_class, class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = flags;
+ ksp->ks_update = kstat_default_update;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ panic("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+ /*
+ * Create sysctl tree for those statistics:
+ *
+ * kstat.<module>.<class>.<name>.
+ */
+ sysctl_ctx_init(&ksp->ks_sysctl_ctx);
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
+ "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+ OID_AUTO, class, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s.%s tree!\n", __func__,
+ module, class);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+ OID_AUTO, name, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__,
+ module, class, name);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ ksp->ks_sysctl_root = root;
+
+ return (ksp);
+}
+
+static int
+kstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ kstat_named_t *ksent = arg1;
+ uint64_t val;
+
+ val = ksent->value.ui64;
+ return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+void
+kstat_install(kstat_t *ksp)
+{
+ kstat_named_t *ksent;
+ char *namelast;
+ int typelast;
+
+ ksent = ksp->ks_data;
+ if (ksp->ks_ndata == UINT32_MAX) {
+#ifdef INVARIANTS
+ printf("can't handle raw ops yet!!!\n");
+#endif
+ return;
+ }
+ if (ksent == NULL) {
+ printf("%s ksp->ks_data == NULL!!!!\n", __func__);
+ return;
+ }
+ typelast = 0;
+ namelast = NULL;
+ for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
+ if (ksent->data_type != 0) {
+ typelast = ksent->data_type;
+ namelast = ksent->name;
+ }
+ switch (typelast) {
+ case KSTAT_DATA_INT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S32 | CTLFLAG_RD, ksent,
+ sizeof (*ksent), kstat_sysctl, "I",
+ namelast);
+ break;
+ case KSTAT_DATA_UINT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U32 | CTLFLAG_RD, ksent,
+ sizeof (*ksent), kstat_sysctl, "IU",
+ namelast);
+ break;
+ case KSTAT_DATA_INT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S64 | CTLFLAG_RD, ksent,
+ sizeof (*ksent), kstat_sysctl, "Q",
+ namelast);
+ break;
+ case KSTAT_DATA_UINT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U64 | CTLFLAG_RD, ksent,
+ sizeof (*ksent), kstat_sysctl, "QU",
+ namelast);
+ break;
+ default:
+ panic("unsupported type: %d", typelast);
+ }
+
+ }
+}
+
+void
+kstat_delete(kstat_t *ksp)
+{
+
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+}
+
+void
+kstat_set_string(char *dst, const char *src)
+{
+
+ bzero(dst, KSTAT_STRLEN);
+ (void) strncpy(dst, src, KSTAT_STRLEN - 1);
+}
+
+void
+kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type)
+{
+
+ kstat_set_string(knp->name, name);
+ knp->data_type = data_type;
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c
new file mode 100644
index 000000000..ab4702574
--- /dev/null
+++ b/module/os/freebsd/spl/spl_misc.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sysctl.h>
+
+#include <sys/zfs_context.h>
+
+char hw_serial[11] = "0";
+
+static struct opensolaris_utsname hw_utsname = {
+ .machine = MACHINE
+};
+
+static void
+opensolaris_utsname_init(void *arg)
+{
+
+ hw_utsname.sysname = ostype;
+ hw_utsname.nodename = prison0.pr_hostname;
+ hw_utsname.release = osrelease;
+ snprintf(hw_utsname.version, sizeof (hw_utsname.version),
+ "%d", osreldate);
+}
+
+char *
+kmem_strdup(const char *s)
+{
+ char *buf;
+
+ buf = kmem_alloc(strlen(s) + 1, KM_SLEEP);
+ strcpy(buf, s);
+ return (buf);
+}
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vpanic(fmt, ap);
+ va_end(ap);
+}
+
+utsname_t *
+utsname(void)
+{
+ return (&hw_utsname);
+}
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ opensolaris_utsname_init, NULL);
diff --git a/module/os/freebsd/spl/spl_policy.c b/module/os/freebsd/spl/spl_policy.c
new file mode 100644
index 000000000..53732836f
--- /dev/null
+++ b/module/os/freebsd/spl/spl_policy.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/jail.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+
+
+int
+secpolicy_nfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON));
+}
+
+int
+secpolicy_zfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
+}
+
+int
+secpolicy_zinject(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT));
+}
+
+int
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT));
+}
+
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+ if (zfs_super_owner) {
+ if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+ cr->cr_prison == mp->mnt_cred->cr_prison) {
+ return (0);
+ }
+ }
+ return (EPERM);
+}
+
+/*
+ * This check is done in kern_link(), so we could just return 0 here.
+ */
+extern int hardlink_check_uid;
+int
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
+{
+
+ if (!hardlink_check_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_LINK));
+}
+
+int
+secpolicy_vnode_stky_modify(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0)
+ return (EACCES);
+ if ((accmode & VWRITE) &&
+ spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
+ return (EACCES);
+ }
+ if (accmode & VEXEC) {
+ if (vp->v_type == VDIR) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
+ return (EACCES);
+ } else {
+ if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
+ return (EACCES);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode)
+{
+ accmode_t mode;
+
+ mode = ~curmode & wantmode;
+
+ if (mode == 0)
+ return (0);
+
+ return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+ static int privs[] = {
+ PRIV_VFS_ADMIN,
+ PRIV_VFS_READ,
+ PRIV_VFS_WRITE,
+ PRIV_VFS_EXEC,
+ PRIV_VFS_LOOKUP
+ };
+ int i;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* Same as secpolicy_vnode_setdac */
+ if (owner == cr->cr_uid)
+ return (0);
+
+ for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+ int priv;
+
+ switch (priv = privs[i]) {
+ case PRIV_VFS_EXEC:
+ if (vp->v_type == VDIR)
+ continue;
+ break;
+ case PRIV_VFS_LOOKUP:
+ if (vp->v_type != VDIR)
+ continue;
+ break;
+ }
+ if (spl_priv_check_cred(cr, priv) == 0)
+ return (0);
+ }
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ int mask = vap->va_mask;
+ int error;
+
+ if (mask & AT_SIZE) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ if (mask & AT_MODE) {
+ /*
+ * If not the owner of the file then check privilege
+ * for two things: the privilege to set the mode at all
+ * and, if we're setting setuid, we also need permissions
+ * to add the set-uid bit, if we're not the owner.
+ * In the specific case of creating a set-uid root
+ * file, we need even more permissions.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+ error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
+ if (error)
+ return (error);
+ } else {
+ vap->va_mode = ovap->va_mode;
+ }
+ if (mask & (AT_UID | AT_GID)) {
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+
+ /*
+ * To change the owner of a file, or change the group of
+ * a file to a group of which we are not a member, the
+ * caller must have privilege.
+ */
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
+ !groupmember(vap->va_gid, cr))) {
+ if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN);
+ if (error)
+ return (error);
+ }
+ }
+
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
+ secpolicy_setid_clear(vap, vp, cr);
+ }
+ }
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ /*
+ * From utimes(2):
+ * If times is NULL, ... The caller must be the owner of
+ * the file, have permission to write the file, or be the
+ * super-user.
+ * If times is non-NULL, ... The caller must be the owner of
+ * the file or be the super-user.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error && (vap->va_vaflags & VA_UTIMES_NULL))
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_vnode_create_gid(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
+{
+
+ if (groupmember(gid, cr))
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SETGID));
+}
+
+int
+secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+ boolean_t issuidroot __unused)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
+}
+
+void
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return;
+
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+ }
+}
+
+int
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr)
+{
+ int error;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /*
+ * Privileged processes may set the sticky bit on non-directories,
+ * as well as set the setgid bit on a file with a group that the process
+ * is not a member of. Both of these are allowed in jail(8).
+ */
+ if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE))
+ return (EFTYPE);
+ }
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0) {
+ error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
+ if (error)
+ return (error);
+ }
+ /*
+ * Deny setting setuid if we are not the file owner.
+ */
+ if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* XXX: vfs_suser()? */
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
+}
+
+int
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN));
+}
+
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+ if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
+ MNT_ILOCK(vfsp);
+ vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+ vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+ vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+ MNT_IUNLOCK(vfsp);
+ }
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+ vtype_t vtype)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
+}
+
+int
+secpolicy_smb(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NETSMB));
+}
diff --git a/module/os/freebsd/spl/spl_procfs_list.c b/module/os/freebsd/spl/spl_procfs_list.c
new file mode 100644
index 000000000..7b4ae9d0e
--- /dev/null
+++ b/module/os/freebsd/spl/spl_procfs_list.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+
+void
+seq_printf(struct seq_file *m, const char *fmt, ...)
+{}
+
+void
+procfs_list_install(const char *module,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_next_id = 1;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+}
+
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
diff --git a/module/os/freebsd/spl/spl_string.c b/module/os/freebsd/spl/spl_string.c
new file mode 100644
index 000000000..14d816b5c
--- /dev/null
+++ b/module/os/freebsd/spl/spl_string.c
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/param.h>
+#include <sys/string.h>
+#include <sys/kmem.h>
+#include <machine/stdarg.h>
+
+#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
+
+#define IS_ALPHA(c) \
+ (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+char *
+strpbrk(const char *s, const char *b)
+{
+ const char *p;
+
+ do {
+ for (p = b; *p != '\0' && *p != *s; ++p)
+ ;
+ if (*p != '\0')
+ return ((char *)s);
+ } while (*s++);
+
+ return (NULL);
+}
+
+/*
+ * Convert a string into a valid C identifier by replacing invalid
+ * characters with '_'. Also makes sure the string is nul-terminated
+ * and takes up at most n bytes.
+ */
+void
+strident_canon(char *s, size_t n)
+{
+ char c;
+ char *end = s + n - 1;
+
+ if ((c = *s) == 0)
+ return;
+
+ if (!IS_ALPHA(c) && c != '_')
+ *s = '_';
+
+ while (s < end && ((c = *(++s)) != 0)) {
+ if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_')
+ *s = '_';
+ }
+ *s = 0;
+}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ char *buf;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx) + 1;
+ va_end(adx);
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, size, fmt, adx);
+ va_end(adx);
+
+ return (buf);
+}
+
+void
+kmem_strfree(char *str)
+{
+ ASSERT(str != NULL);
+ kmem_free(str, strlen(str) + 1);
+}
diff --git a/module/os/freebsd/spl/spl_sunddi.c b/module/os/freebsd/spl/spl_sunddi.c
new file mode 100644
index 000000000..1fa4f56f1
--- /dev/null
+++ b/module/os/freebsd/spl/spl_sunddi.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sunddi.h>
+#include <sys/sysctl.h>
+
+int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+
+ *result = strtol(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
+{
+
+ if (str == hw_serial) {
+ *result = prison0.pr_hostid;
+ return (0);
+ }
+
+ *result = strtoul(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+ *result = (unsigned long long)strtouq(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoll(const char *str, char **nptr, int base, long long *result)
+{
+
+ *result = (long long)strtoq(str, nptr, base);
+ return (0);
+}
diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c
new file mode 100644
index 000000000..d3748276a
--- /dev/null
+++ b/module/os/freebsd/spl/spl_sysevent.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <[email protected]>
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/sbuf.h>
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/bus.h>
+
+static int
+log_sysevent(nvlist_t *event)
+{
+ struct sbuf *sb;
+ const char *type;
+ char typestr[128];
+ nvpair_t *elem = NULL;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+ type = NULL;
+
+ while ((elem = nvlist_next_nvpair(event, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ case DATA_TYPE_BOOLEAN:
+ {
+ boolean_t value;
+
+ (void) nvpair_value_boolean_value(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+ value ? "true" : "false");
+ break;
+ }
+ case DATA_TYPE_UINT8:
+ {
+ uint8_t value;
+
+ (void) nvpair_value_uint8(elem, &value);
+ sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+ break;
+ }
+ case DATA_TYPE_INT32:
+ {
+ int32_t value;
+
+ (void) nvpair_value_int32(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT32:
+ {
+ uint32_t value;
+
+ (void) nvpair_value_uint32(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_INT64:
+ {
+ int64_t value;
+
+ (void) nvpair_value_int64(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT64:
+ {
+ uint64_t value;
+
+ (void) nvpair_value_uint64(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_STRING:
+ {
+ char *value;
+
+ (void) nvpair_value_string(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+ if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+ type = value;
+ break;
+ }
+ case DATA_TYPE_UINT8_ARRAY:
+ {
+ uint8_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint8_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%02hhx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT16_ARRAY:
+ {
+ uint16_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint16_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%04hx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT32_ARRAY:
+ {
+ uint32_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint32_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY:
+ {
+ int64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_int64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016lld",
+ (long long)value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY:
+ {
+ uint64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_STRING_ARRAY:
+ {
+ char **strarr;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_string_array(elem, &strarr, &nelem);
+
+ for (ii = 0; ii < nelem; ii++) {
+ if (strarr[ii] == NULL) {
+ sbuf_printf(sb, " <NULL>");
+ continue;
+ }
+
+ sbuf_printf(sb, " %s", strarr[ii]);
+ if (strcmp(FM_CLASS, strarr[ii]) == 0)
+ type = strarr[ii];
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST:
+ /* XXX - requires recursing in log_sysevent */
+ break;
+ default:
+ printf("%s: type %d is not implemented\n", __func__,
+ nvpair_type(elem));
+ break;
+ }
+ }
+
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ return (ENOMEM);
+ }
+
+ if (type == NULL)
+ type = "";
+ if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+ snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8);
+ type = typestr;
+ }
+ devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+ sbuf_delete(sb);
+
+ return (0);
+}
+
+static void
+sysevent_worker(void *arg __unused)
+{
+ zfs_zevent_t *ze;
+ nvlist_t *event;
+ uint64_t dropped = 0;
+ uint64_t dst_size;
+ int error;
+
+ zfs_zevent_init(&ze);
+ for (;;) {
+ dst_size = 131072;
+ dropped = 0;
+ event = NULL;
+ error = zfs_zevent_next(ze, &event,
+ &dst_size, &dropped);
+ if (error) {
+ error = zfs_zevent_wait(ze);
+ if (error == ESHUTDOWN)
+ break;
+ } else {
+ VERIFY(event != NULL);
+ log_sysevent(event);
+ nvlist_free(event);
+ }
+ }
+ zfs_zevent_destroy(ze);
+ kthread_exit();
+}
+
+void
+ddi_sysevent_init(void)
+{
+ kproc_kthread_add(sysevent_worker, NULL, &zfsproc, NULL, 0, 0,
+ "zfskern", "sysevent");
+}
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
new file mode 100644
index 000000000..b6a501f67
--- /dev/null
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2009 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/taskq.h>
+#include <sys/zfs_context.h>
+
+#include <vm/uma.h>
+
+static uint_t taskq_tsd;
+static uma_zone_t taskq_zone;
+
+taskq_t *system_taskq = NULL;
+taskq_t *system_delay_taskq = NULL;
+taskq_t *dynamic_taskq = NULL;
+
+extern int uma_align_cache;
+
+#define TQ_MASK uma_align_cache
+#define TQ_PTR_MASK ~uma_align_cache
+
+#define TIMEOUT_TASK 1
+#define NORMAL_TASK 2
+
+static int
+taskqent_init(void *mem, int size, int flags)
+{
+ bzero(mem, sizeof (taskq_ent_t));
+ return (0);
+}
+
+static int
+taskqent_ctor(void *mem, int size, void *arg, int flags)
+{
+ return (0);
+}
+
+static void
+taskqent_dtor(void *mem, int size, void *arg)
+{
+ taskq_ent_t *ent = mem;
+
+ ent->tqent_gen = (ent->tqent_gen + 1) & TQ_MASK;
+}
+
+static void
+system_taskq_init(void *arg)
+{
+
+ tsd_create(&taskq_tsd, NULL);
+ taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t),
+ taskqent_ctor, taskqent_dtor, taskqent_init, NULL,
+ UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
+ system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri,
+ 0, 0, 0);
+ system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus,
+ minclsyspri, 0, 0, 0);
+}
+SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init,
+ NULL);
+
+static void
+system_taskq_fini(void *arg)
+{
+
+ taskq_destroy(system_taskq);
+ uma_zdestroy(taskq_zone);
+ tsd_destroy(&taskq_tsd);
+}
+SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini,
+ NULL);
+
+static void
+taskq_tsd_set(void *context)
+{
+ taskq_t *tq = context;
+
+ tsd_set(taskq_tsd, tq);
+}
+
+static taskq_t *
+taskq_create_with_init(const char *name, int nthreads, pri_t pri,
+ int minalloc __unused, int maxalloc __unused, uint_t flags)
+{
+ taskq_t *tq;
+
+ if ((flags & TASKQ_THREADS_CPU_PCT) != 0)
+ nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
+
+ tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+ tq->tq_queue = taskqueue_create(name, M_WAITOK,
+ taskqueue_thread_enqueue, &tq->tq_queue);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
+ taskq_tsd_set, tq);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
+ taskq_tsd_set, NULL);
+ (void) taskqueue_start_threads(&tq->tq_queue, nthreads, pri,
+ "%s", name);
+
+ return ((taskq_t *)tq);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused,
+ int maxalloc __unused, uint_t flags)
+{
+
+ return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc,
+ flags));
+}
+
+taskq_t *
+taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc,
+ int maxalloc, proc_t *proc __unused, uint_t flags)
+{
+
+ return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc,
+ flags));
+}
+
+void
+taskq_destroy(taskq_t *tq)
+{
+
+ taskqueue_free(tq->tq_queue);
+ kmem_free(tq, sizeof (*tq));
+}
+
+int
+taskq_member(taskq_t *tq, kthread_t *thread)
+{
+
+ return (taskqueue_member(tq->tq_queue, thread));
+}
+
+taskq_t *
+taskq_of_curthread(void)
+{
+ return (tsd_get(taskq_tsd));
+}
+
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+{
+ uint32_t pend;
+ int rc;
+ taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK);
+
+ if (ent == NULL)
+ return (0);
+ if ((tid & TQ_MASK) != ent->tqent_gen)
+ return (0);
+ if (ent->tqent_type == TIMEOUT_TASK) {
+ rc = taskqueue_cancel_timeout(tq->tq_queue,
+ &ent->tqent_timeout_task, &pend);
+ } else
+ rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+ if (rc == EBUSY)
+ taskq_wait_id(tq, tid);
+ else
+ uma_zfree(taskq_zone, ent);
+ return (rc);
+}
+
+static void
+taskq_run(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ task->tqent_func(task->tqent_arg);
+ uma_zfree(taskq_zone, task);
+}
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskq_ent_t *task;
+ taskqid_t tid;
+ clock_t timo;
+ int mflag;
+
+ timo = expire_time - ddi_get_lbolt();
+ if (timo <= 0)
+ return (taskq_dispatch(tq, func, arg, flags));
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+ tid = (uintptr_t)task;
+ MPASS((tid & TQ_MASK) == 0);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_type = TIMEOUT_TASK;
+ tid |= task->tqent_gen;
+ TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
+ taskq_run, task);
+
+ taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task,
+ timo);
+ return (tid);
+}
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *task;
+ int mflag, prio;
+ taskqid_t tid;
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+
+ tid = (uintptr_t)task;
+ MPASS((tid & TQ_MASK) == 0);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_type = NORMAL_TASK;
+ TASK_INIT(&task->tqent_task, prio, taskq_run, task);
+ tid |= task->tqent_gen;
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+ return (tid);
+}
+
+static void
+taskq_run_ent(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ task->tqent_func(task->tqent_arg);
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
+ taskq_ent_t *task)
+{
+ int prio;
+
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+
+ TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task);
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+}
+
+void
+taskq_wait(taskq_t *tq)
+{
+ taskqueue_quiesce(tq->tq_queue);
+}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t tid)
+{
+ taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK);
+
+ if ((tid & TQ_MASK) != ent->tqent_gen)
+ return;
+
+ taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+}
+
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused)
+{
+ taskqueue_drain_all(tq->tq_queue);
+}
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (t->tqent_task.ta_pending == 0);
+}
diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c
new file mode 100644
index 000000000..05dbfd06d
--- /dev/null
+++ b/module/os/freebsd/spl/spl_uio.c
@@ -0,0 +1,92 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+/*
+ * same as uiomove() but doesn't modify uio structure.
+ * return in cbytes how many bytes were copied.
+ */
+int
+uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
+{
+ struct iovec small_iovec[1];
+ struct uio small_uio_clone;
+ struct uio *uio_clone;
+ int error;
+
+ ASSERT3U(uio->uio_rw, ==, rw);
+ if (uio->uio_iovcnt == 1) {
+ small_uio_clone = *uio;
+ small_iovec[0] = *uio->uio_iov;
+ small_uio_clone.uio_iov = small_iovec;
+ uio_clone = &small_uio_clone;
+ } else {
+ uio_clone = cloneuio(uio);
+ }
+
+ error = vn_io_fault_uiomove(p, n, uio_clone);
+ *cbytes = uio->uio_resid - uio_clone->uio_resid;
+ if (uio_clone != &small_uio_clone)
+ free(uio_clone, M_IOV);
+ return (error);
+}
+
+/*
+ * Drop the next n chars out of *uiop.
+ */
+void
+uioskip(uio_t *uio, size_t n)
+{
+ enum uio_seg segflg;
+
+ /* For the full compatibility with illumos. */
+ if (n > uio->uio_resid)
+ return;
+
+ segflg = uio->uio_segflg;
+ uio->uio_segflg = UIO_NOCOPY;
+ uiomove(NULL, n, uio->uio_rw, uio);
+ uio->uio_segflg = segflg;
+}
diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c
new file mode 100644
index 000000000..99da8c976
--- /dev/null
+++ b/module/os/freebsd/spl/spl_vfs.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cred.h>
+#include <sys/vfs.h>
+#include <sys/priv.h>
+#include <sys/libkern.h>
+
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+
+MALLOC_DECLARE(M_MOUNT);
+
+void
+vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
+ int flags __unused)
+{
+ struct vfsopt *opt;
+ size_t namesize;
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+
+ if (vfsp->mnt_opt == NULL) {
+ void *opts;
+
+ MNT_IUNLOCK(vfsp);
+ opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
+ MNT_ILOCK(vfsp);
+ if (vfsp->mnt_opt == NULL) {
+ vfsp->mnt_opt = opts;
+ TAILQ_INIT(vfsp->mnt_opt);
+ } else {
+ free(opts, M_MOUNT);
+ }
+ }
+
+ MNT_IUNLOCK(vfsp);
+
+ opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK);
+ namesize = strlen(name) + 1;
+ opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
+ strlcpy(opt->name, name, namesize);
+ opt->pos = -1;
+ opt->seen = 1;
+ if (arg == NULL) {
+ opt->value = NULL;
+ opt->len = 0;
+ } else {
+ opt->len = strlen(arg) + 1;
+ opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(arg, opt->value, opt->len);
+ }
+
+ MNT_ILOCK(vfsp);
+ TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+void
+vfs_clearmntopt(vfs_t *vfsp, const char *name)
+{
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+ vfs_deleteopt(vfsp->mnt_opt, name);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+int
+vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
+{
+ struct vfsoptlist *opts = vfsp->mnt_optnew;
+ int error;
+
+ if (opts == NULL)
+ return (0);
+ error = vfs_getopt(opts, opt, (void **)argp, NULL);
+ return (error != 0 ? 0 : 1);
+}
+
+int
+mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
+ char *fspec, int fsflags)
+{
+ struct vfsconf *vfsp;
+ struct mount *mp;
+ vnode_t *vp, *mvp;
+ struct ucred *cr;
+ int error;
+
+ ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");
+
+ vp = *vpp;
+ *vpp = NULL;
+ error = 0;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ error = ENAMETOOLONG;
+ if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
+ error = ENODEV;
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ /*
+ * We need vnode lock to protect v_mountedhere and vnode interlock
+ * to protect v_iflag.
+ */
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK1(vp);
+
+ /*
+ * Allocate and initialize the filesystem.
+ * We don't want regular user that triggered snapshot mount to be able
+ * to unmount it, so pass credentials of the parent mount.
+ */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
+
+ mp->mnt_optnew = NULL;
+ vfs_setmntopt(mp, "from", fspec, 0);
+ mp->mnt_optnew = mp->mnt_opt;
+ mp->mnt_opt = NULL;
+
+ /*
+ * Set the mount level flags.
+ */
+ mp->mnt_flag = fsflags & MNT_UPDATEMASK;
+ /*
+ * Snapshots are always read-only.
+ */
+ mp->mnt_flag |= MNT_RDONLY;
+ /*
+ * We don't want snapshots to allow access to vulnerable setuid
+ * programs, so we turn off setuid when mounting snapshots.
+ */
+ mp->mnt_flag |= MNT_NOSUID;
+ /*
+ * We don't want snapshots to be visible in regular
+ * mount(8) and df(1) output.
+ */
+ mp->mnt_flag |= MNT_IGNORE;
+ /*
+ * XXX: This is evil, but we can't mount a snapshot as a regular user.
+ * XXX: Is is safe when snapshot is mounted from within a jail?
+ */
+ cr = td->td_ucred;
+ td->td_ucred = kcred;
+ error = VFS_MOUNT(mp);
+ td->td_ucred = cr;
+
+ if (error != 0) {
+ /*
+ * Clear VI_MOUNT and decrement the use count "atomically",
+ * under the vnode lock. This is not strictly required,
+ * but makes it easier to reason about the life-cycle and
+ * ownership of the covered vnode.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vput(vp);
+ vfs_unbusy(mp);
+ vfs_freeopts(mp->mnt_optnew);
+ mp->mnt_vnodecovered = NULL;
+ vfs_mount_destroy(mp);
+ return (error);
+ }
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ (void) VFS_STATFS(mp, &mp->mnt_stat);
+
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(vp);
+#endif
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+
+ vp->v_mountedhere = mp;
+ /* Put the new filesystem on the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+ panic("mount: lost mount");
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300048
+ vfs_op_exit(mp);
+#endif
+ vfs_unbusy(mp);
+ *vpp = mvp;
+ return (0);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+ VERIFY(vp->v_count > 0);
+ if (refcount_release_if_not_last(&vp->v_usecount)) {
+#if __FreeBSD_version < 1300045
+ vdrop(vp);
+#endif
+ return;
+ }
+ VERIFY(taskq_dispatch((taskq_t *)taskq,
+ (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c
new file mode 100644
index 000000000..cd18ebb7a
--- /dev/null
+++ b/module/os/freebsd/spl/spl_vm.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/lock.h>
+#include <sys/freebsd_rwlock.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
+const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
+const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
+const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
+
+void
+zfs_vmobject_assert_wlocked(vm_object_t object)
+{
+
+ /*
+ * This is not ideal because FILE/LINE used by assertions will not
+ * be too helpful, but it must be an hard function for
+ * compatibility reasons.
+ */
+ VM_OBJECT_ASSERT_WLOCKED(object);
+}
+
+void
+zfs_vmobject_wlock(vm_object_t object)
+{
+
+ VM_OBJECT_WLOCK(object);
+}
+
+void
+zfs_vmobject_wunlock(vm_object_t object)
+{
+
+ VM_OBJECT_WUNLOCK(object);
+}
diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c
new file mode 100644
index 000000000..7549483d8
--- /dev/null
+++ b/module/os/freebsd/spl/spl_zlib.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+#if __FreeBSD_version >= 1300041
+#include <contrib/zlib/zlib.h>
+#else
+#include <sys/zlib.h>
+#endif
+#include <sys/kobj.h>
+
+
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+ return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+ free(ptr, M_SOLARIS);
+}
+
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (deflateInit(stream, level));
+}
+
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+ return (deflate(stream, flush));
+}
+
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+ return (deflateEnd(stream));
+}
+
+static int
+zlib_inflateInit(z_stream *stream)
+{
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (inflateInit(stream));
+}
+
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+ return (inflate(stream, finish));
+#else
+ return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+ return (inflateInit(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * that improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ // return (kmem_cache_alloc(zlib_workspace_cache, flags));
+ return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ // kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+ stream.opaque = NULL;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err);
+}
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the compressed buffer.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ return (err);
+}
+
+#if 0
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_VMEM | KMC_NOEMERGENCY);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}
+#endif
diff --git a/module/os/freebsd/spl/spl_zone.c b/module/os/freebsd/spl/spl_zone.c
new file mode 100644
index 000000000..40f21934e
--- /dev/null
+++ b/module/os/freebsd/spl/spl_zone.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/jail.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/zone.h>
+
+#include <sys/policy.h>
+
+static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
+
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ */
+typedef struct zone_dataset {
+ LIST_ENTRY(zone_dataset) zd_next;
+ char zd_dataset[0];
+} zone_dataset_t;
+
+LIST_HEAD(zone_dataset_head, zone_dataset);
+
+static int zone_slot;
+
+int
+zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd, *zd2;
+ struct prison *pr;
+ int dofree, error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ /* Allocate memory before we grab prison's mutex. */
+ zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid); /* Locks &pr->pr_mtx. */
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL) {
+ free(zd, M_ZONES);
+ return (ENOENT);
+ }
+
+ head = osd_jail_get(pr, zone_slot);
+ if (head != NULL) {
+ dofree = 0;
+ LIST_FOREACH(zd2, head, zd_next) {
+ if (strcmp(dataset, zd2->zd_dataset) == 0) {
+ free(zd, M_ZONES);
+ error = EEXIST;
+ goto end;
+ }
+ }
+ } else {
+ dofree = 1;
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+ head = malloc(sizeof (*head), M_ZONES, M_WAITOK);
+ LIST_INIT(head);
+ mtx_lock(&pr->pr_mtx);
+ error = osd_jail_set(pr, zone_slot, head);
+ KASSERT(error == 0, ("osd_jail_set() failed (error=%d)",
+ error));
+ }
+ strcpy(zd->zd_dataset, dataset);
+ LIST_INSERT_HEAD(head, zd, zd_next);
+end:
+ if (dofree)
+ prison_free_locked(pr);
+ else
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+int
+zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ int error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL) {
+ error = ENOENT;
+ goto end;
+ }
+ LIST_FOREACH(zd, head, zd_next) {
+ if (strcmp(dataset, zd->zd_dataset) == 0)
+ break;
+ }
+ if (zd == NULL)
+ error = ENOENT;
+ else {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ if (LIST_EMPTY(head))
+ osd_jail_del(pr, zone_slot);
+ error = 0;
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ size_t len;
+ int ret = 0;
+
+ if (dataset[0] == '\0')
+ return (0);
+ if (INGLOBALZONE(curproc)) {
+ if (write != NULL)
+ *write = 1;
+ return (1);
+ }
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL)
+ goto end;
+
+ /*
+ * Walk the list once, looking for datasets which match exactly, or
+ * specify a dataset underneath an exported dataset. If found, return
+ * true and note that it is writable.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(zd->zd_dataset);
+ if (strlen(dataset) >= len &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ (dataset[len] == '\0' || dataset[len] == '/' ||
+ dataset[len] == '@')) {
+ if (write)
+ *write = 1;
+ ret = 1;
+ goto end;
+ }
+ }
+
+ /*
+ * Walk the list a second time, searching for datasets which are parents
+ * of exported datasets. These should be visible, but read-only.
+ *
+ * Note that we also have to support forms such as 'pool/dataset/', with
+ * a trailing slash.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(dataset);
+ if (dataset[len - 1] == '/')
+ len--; /* Ignore trailing slash */
+ if (len < strlen(zd->zd_dataset) &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ zd->zd_dataset[len] == '/') {
+ if (write)
+ *write = 0;
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (ret);
+}
+
+static void
+zone_destroy(void *arg)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+
+ head = arg;
+ while ((zd = LIST_FIRST(head)) != NULL) {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ }
+ free(head, M_ZONES);
+}
+
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+ KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+ return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
+boolean_t
+in_globalzone(struct proc *p)
+{
+ return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred));
+}
+
+static void
+zone_sysinit(void *arg __unused)
+{
+
+ zone_slot = osd_jail_register(zone_destroy, NULL);
+}
+
+static void
+zone_sysuninit(void *arg __unused)
+{
+
+ osd_jail_deregister(zone_slot);
+}
+
+SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
+SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
diff --git a/module/os/freebsd/zfs/abd.c b/module/os/freebsd/zfs/abd.c
new file mode 100644
index 000000000..888a113a4
--- /dev/null
+++ b/module/os/freebsd/zfs/abd.c
@@ -0,0 +1,1134 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation since
+ * when we are at the limit of allocatable space, using equal-size chunks will
+ * allow us to quickly reclaim enough space for a new large allocation (assuming
+ * it is also scattered).
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly, if you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+};
+
+#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
+#define ABDSTAT_INCR(stat, val) \
+ atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
+#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
+
+/*
+ * It is possible to make all future ABDs be linear by setting this to B_FALSE.
+ * Otherwise, ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear().
+ */
+boolean_t zfs_abd_scatter_enabled = B_TRUE;
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+ &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+ &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+extern inline boolean_t abd_is_linear(abd_t *abd);
+extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
+extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
+extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_zero(abd_t *abd, size_t size);
+
+static void *
+abd_alloc_chunk()
+{
+ void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+ ASSERT3P(c, !=, NULL);
+ return (c);
+}
+
+static void
+abd_free_chunk(void *c)
+{
+ kmem_cache_free(abd_chunk_cache, c);
+}
+
+void
+abd_init(void)
+{
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+}
+
+void
+abd_fini(void)
+{
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ kmem_cache_destroy(abd_chunk_cache);
+ abd_chunk_cache = NULL;
+}
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+ ASSERT(!abd_is_linear(abd));
+ return (abd_chunkcnt_for_bytes(
+ abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META));
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+ } else {
+ ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
+ zfs_abd_chunk_size);
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ ASSERT3P(
+ abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
+ }
+ }
+}
+
+static inline abd_t *
+abd_alloc_struct(size_t chunkcnt)
+{
+ size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, size);
+
+ return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+ size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+ int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ kmem_free(abd, size);
+ ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size)
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ size_t n = abd_chunkcnt_for_bytes(size);
+ abd_t *abd = abd_alloc_struct(n);
+
+ abd->abd_flags = ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ for (int i = 0; i < n; i++) {
+ void *c = abd_alloc_chunk();
+ ASSERT3P(c, !=, NULL);
+ abd->abd_u.abd_scatter.abd_chunks[i] = c;
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ n * zfs_abd_chunk_size - size);
+
+ return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ abd->abd_size - n * zfs_abd_chunk_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ if (is_metadata) {
+ abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+ } else {
+ abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ } else {
+ zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+ if (abd == NULL)
+ return;
+
+ abd_verify(abd);
+ ASSERT3P(abd->abd_parent, ==, NULL);
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd))
+ abd_free_linear(abd);
+ else
+ abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+/* ARGSUSED */
+static inline abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
+{
+ abd_t *abd;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ abd->abd_u.abd_linear.abd_buf =
+ (char *)sabd->abd_u.abd_linear.abd_buf + off;
+ } else {
+ size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+ size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ (new_offset / zfs_abd_chunk_size);
+
+ abd = abd_alloc_struct(chunkcnt);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = 0;
+
+ abd->abd_u.abd_scatter.abd_offset =
+ new_offset % zfs_abd_chunk_size;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ /* Copy the scatterlist starting at the correct offset */
+ (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
+ &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
+ zfs_abd_chunk_size],
+ chunkcnt * sizeof (void *));
+ }
+
+ if (size == 0)
+ abd->abd_size = sabd->abd_size - off;
+ else
+ abd->abd_size = size;
+ abd->abd_parent = sabd;
+ zfs_refcount_create(&abd->abd_children);
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+ return (abd);
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+
+ return (abd_get_offset_impl(sabd, off, 0));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_linear.abd_buf = buf;
+
+ return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+ if (abd == NULL)
+ return;
+ abd_verify(abd);
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+ if (abd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+struct abd_iter {
+ abd_t *iter_abd; /* ABD being iterated through */
+ size_t iter_pos; /* position (relative to abd_offset) */
+ void *iter_mapaddr; /* addr corresponding to iter_pos */
+ size_t iter_mapsize; /* length of data valid at mapaddr */
+};
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+ aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+ aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_pos = 0;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* Panic if someone has changed zfs_abd_chunk_size */
+ IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+ aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ offset = aiter->iter_pos;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+ } else {
+ size_t index = abd_iter_scatter_chunk_index(aiter);
+ offset = abd_iter_scatter_chunk_offset(aiter);
+ aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
+ }
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter aiter;
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_iter_init(&aiter, abd);
+ abd_iter_advance(&aiter, off);
+
+ while (size > 0) {
+ abd_iter_map(&aiter);
+
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (ret);
+}
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ abd_iter_init(&daiter, dabd);
+ abd_iter_init(&saiter, sabd);
+ abd_iter_advance(&daiter, doff);
+ abd_iter_advance(&saiter, soff);
+
+ while (size > 0) {
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ size_t dlen = MIN(daiter.iter_mapsize, size);
+ size_t slen = MIN(saiter.iter_mapsize, size);
+ size_t len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&daiter, len);
+ abd_iter_advance(&saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd)
+{
+ ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
+ return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
+ abd_cmp_cb, NULL));
+}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen should be implemented so that its behaviour
+ * is the same when taking linear and when taking scatter
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter = {0};
+ void *caddrs[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++)
+ abd_iter_init(&caiters[i], cabds[i]);
+
+ if (dabd)
+ abd_iter_init(&daiter, dabd);
+
+ ASSERT3S(dsize, >=, 0);
+
+ critical_enter();
+ while (csize > 0) {
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ abd_iter_advance(&caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ abd_iter_advance(&daiter, dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+ critical_exit();
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_init(&citers[i], cabds[i]);
+ abd_iter_init(&xiters[i], tabds[i]);
+ }
+
+ critical_enter();
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ abd_iter_advance(&xiters[i], len);
+ abd_iter_advance(&citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ critical_exit();
+}
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 000000000..f0c272447
--- /dev/null
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/eventhandler.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/sdt.h>
+#include <sys/aggsum.h>
+#include <cityhash.h>
+
+extern struct vfsops zfs_vfsops;
+
+/* vmem_size typemask */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+#define VMEM_MAXFREE 0x10
+typedef size_t vmem_size_t;
+extern vmem_size_t vmem_size(vmem_t *vm, int typemask);
+
+uint_t zfs_arc_free_target = 0;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+ int64_t n __unused;
+ free_memory_reason_t r = FMR_UNKNOWN;
+
+#ifdef _KERNEL
+ /*
+ * Cooperate with pagedaemon when it's time for it to scan
+ * and reclaim some pages.
+ */
+ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ n = uma_avail() - (long)(uma_limit() / 4);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_HEAP_ARENA;
+ }
+#endif
+
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this arena remains
+ * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
+ *
+ * Note that reducing the arc_zio_arena_free_shift keeps more virtual
+ * memory (in the zio_arena) free, which can avoid memory
+ * fragmentation issues.
+ */
+ if (zio_arena != NULL) {
+ n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+ (vmem_size(zio_arena, VMEM_ALLOC) >>
+ arc_zio_arena_free_shift);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_ARENA;
+ }
+ }
+
+#else /* _KERNEL */
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+#endif /* _KERNEL */
+
+ last_free_memory = lowest;
+ last_free_reason = r;
+ DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+ uint64_t size;
+
+ if (allmem >= 1 << 30)
+ size = allmem - (1 << 30);
+ else
+ size = min;
+ return (MAX(allmem * 5 / 8, size));
+}
+
+/*
+ * Helper function for arc_prune_async() it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *arg)
+{
+ int64_t nr_scan = *(int64_t *)arg;
+
+ arc_reduce_target_size(ptob(nr_scan));
+ free(arg, M_TEMP);
+ vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+ int64_t *adjustptr;
+
+ if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+ return;
+
+ *adjustptr = adjust;
+ taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+ ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+ return ((uint64_t)ptob(physmem));
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ return (0);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+ /* XXX */
+ return (0);
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+ int64_t free_memory, to_free;
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ free_memory = arc_available_memory();
+ to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+ DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+ arc_reduce_target_size(to_free);
+
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+
+ /*
+ * It is unsafe to block here in arbitrary threads, because we can come
+ * here from ARC itself and may hold ARC locks and thus risk a deadlock
+ * with ARC reclaim thread.
+ */
+ if (curproc == pageproc)
+ (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
+ mutex_exit(&arc_adjust_lock);
+}
+
+void
+arc_lowmem_init(void)
+{
+ arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+}
+
+void
+arc_lowmem_fini(void)
+{
+ if (arc_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
diff --git a/module/os/freebsd/zfs/crypto_os.c b/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000..cc86074c2
--- /dev/null
+++ b/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek <[email protected]>
+ * Copyright (c) 2018 Sean Eric Fagan <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef _KERNEL
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform.h>
+#else
+#include <strings.h>
+#endif
+
+#include <sys/zio_crypt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+#include <sys/freebsd_crypto.h>
+
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+static int crypt_sessions = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD,
+ &crypt_sessions, 0, "Number of cryptographic sessions created");
+
+void
+crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key)
+{
+ uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE],
+ k_opad[SHA512_HMAC_BLOCK_SIZE],
+ key[SHA512_HMAC_BLOCK_SIZE];
+ SHA512_CTX lctx;
+ int i;
+ size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length);
+
+ /*
+ * This code is based on the similar code in geom/eli/g_eli_hmac.c
+ */
+ explicit_bzero(key, sizeof (key));
+ if (c_key->ck_length == 0)
+ /* do nothing */;
+ else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE)
+ bcopy(c_key->ck_data, key, cl_bytes);
+ else {
+ /*
+ * If key is longer than 128 bytes reset it to
+ * key = SHA512(key).
+ */
+ SHA512_Init(&lctx);
+ SHA512_Update(&lctx, c_key->ck_data, cl_bytes);
+ SHA512_Final(key, &lctx);
+ }
+
+ /* XOR key with ipad and opad values. */
+ for (i = 0; i < sizeof (key); i++) {
+ k_ipad[i] = key[i] ^ 0x36;
+ k_opad[i] = key[i] ^ 0x5c;
+ }
+ explicit_bzero(key, sizeof (key));
+
+ /* Start inner SHA512. */
+ SHA512_Init(&ctx->innerctx);
+ SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad));
+ explicit_bzero(k_ipad, sizeof (k_ipad));
+ /* Start outer SHA512. */
+ SHA512_Init(&ctx->outerctx);
+ SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad));
+ explicit_bzero(k_opad, sizeof (k_opad));
+}
+
+void
+crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize)
+{
+ SHA512_Update(&ctx->innerctx, data, datasize);
+}
+
+void
+crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize)
+{
+ uint8_t digest[SHA512_DIGEST_LENGTH];
+
+ /* Complete inner hash */
+ SHA512_Final(digest, &ctx->innerctx);
+
+ /* Complete outer hash */
+ SHA512_Update(&ctx->outerctx, digest, sizeof (digest));
+ SHA512_Final(digest, &ctx->outerctx);
+
+ explicit_bzero(ctx, sizeof (*ctx));
+ /* mdsize == 0 means "Give me the whole hash!" */
+ if (mdsize == 0)
+ mdsize = SHA512_DIGEST_LENGTH;
+ bcopy(digest, md, mdsize);
+ explicit_bzero(digest, sizeof (digest));
+}
+
+void
+crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size,
+ void *out_data, size_t out_data_size)
+{
+ struct hmac_ctx ctx;
+
+ crypto_mac_init(&ctx, key);
+ crypto_mac_update(&ctx, in_data, in_data_size);
+ crypto_mac_final(&ctx, out_data, out_data_size);
+}
+
+static int
+freebsd_zfs_crypt_done(struct cryptop *crp)
+{
+ freebsd_crypt_session_t *ses;
+
+ ses = crp->crp_opaque;
+ mtx_lock(&ses->fs_lock);
+ ses->fs_done = true;
+ mtx_unlock(&ses->fs_lock);
+ wakeup(crp);
+ return (0);
+}
+
+void
+freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
+{
+ mtx_destroy(&sess->fs_lock);
+ crypto_freesession(sess->fs_sid);
+ explicit_bzero(sess, sizeof (*sess));
+}
+
+static int
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
+{
+ int error;
+
+ crp->crp_opaque = session;
+ crp->crp_callback = freebsd_zfs_crypt_done;
+ for (;;) {
+ error = crypto_dispatch(crp);
+ if (error)
+ break;
+ mtx_lock(&session->fs_lock);
+ while (session->fs_done == false)
+ msleep(crp, &session->fs_lock, PRIBIO,
+ "zfs_crypto", hz/5);
+ mtx_unlock(&session->fs_lock);
+
+ if (crp->crp_etype != EAGAIN) {
+ error = crp->crp_etype;
+ break;
+ }
+ crp->crp_etype = 0;
+ crp->crp_flags &= ~CRYPTO_F_DONE;
+ session->fs_done = false;
+#if __FreeBSD_version < 1300087
+ /*
+ * Session ID changed, so we should record that,
+ * and try again
+ */
+ session->fs_sid = crp->crp_session;
+#endif
+ }
+ return (error);
+}
+static void
+freebsd_crypt_uio_debug_log(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+#ifdef FCRYPTO_DEBUG
+ struct cryptodesc *crd;
+ uint8_t *p = NULL;
+ size_t total = 0;
+
+ printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, "
+ "%p, %u, %u)\n",
+ __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ data_uio, key->ck_format, key->ck_data,
+ (unsigned int)key->ck_length,
+ ivbuf, (unsigned int)datalen, (unsigned int)auth_len);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+ for (int i = 0; i < data_uio->uio_iovcnt; i++) {
+ printf("\tiovec #%d: <%p, %u>\n", i,
+ data_uio->uio_iov[i].iov_base,
+ (unsigned int)data_uio->uio_iov[i].iov_len);
+ total += data_uio->uio_iov[i].iov_len;
+ }
+ data_uio->uio_resid = total;
+#endif
+}
+/*
+ * Create a new cryptographic session. This should
+ * happen every time the key changes (including when
+ * it's first loaded).
+ */
+#if __FreeBSD_version >= 1300087
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct crypto_session_params csp;
+ int error = 0;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ bzero(&csp, sizeof (csp));
+ csp.csp_mode = CSP_MODE_AEAD;
+ csp.csp_cipher_key = key->ck_data;
+ csp.csp_cipher_klen = key->ck_length / 8;
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
+ csp.csp_ivlen = AES_GCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ case AES_192_GMAC_KEY_LEN:
+ case AES_256_GMAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ csp.csp_cipher_alg = CRYPTO_AES_CCM_16;
+ csp.csp_ivlen = AES_CCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ case AES_192_CBC_MAC_KEY_LEN:
+ case AES_256_CBC_MAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+ error = crypto_newsession(&sessp->fs_sid, &csp,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ freebsd_crypt_session_t *session = NULL;
+ int error = 0;
+ size_t total = 0;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ for (int i = 0; i < data_uio->uio_iovcnt; i++)
+ total += data_uio->uio_iov[i].iov_len;
+ data_uio->uio_resid = total;
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(session->fs_sid, M_WAITOK);
+ if (encrypt) {
+ crp->crp_op = CRYPTO_OP_ENCRYPT |
+ CRYPTO_OP_COMPUTE_DIGEST;
+ } else {
+ crp->crp_op = CRYPTO_OP_DECRYPT |
+ CRYPTO_OP_VERIFY_DIGEST;
+ }
+ crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE;
+ crp->crp_buf_type = CRYPTO_BUF_UIO;
+ crp->crp_uio = (void*)data_uio;
+ crp->crp_ilen = data_uio->uio_resid;
+
+ crp->crp_aad_start = 0;
+ crp->crp_aad_length = auth_len;
+ crp->crp_payload_start = auth_len;
+ crp->crp_payload_length = datalen;
+ crp->crp_digest_start = auth_len + datalen;
+
+ bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN);
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+ return (error);
+}
+
+#else
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct cryptoini cria, crie, *crip;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ int error = 0;
+ crypto_session_t sid;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ bzero(&crie, sizeof (crie));
+ bzero(&cria, sizeof (cria));
+
+ crie.cri_alg = xform->type;
+ crie.cri_key = key->ck_data;
+ crie.cri_klen = key->ck_length;
+
+ cria.cri_alg = xauth->type;
+ cria.cri_key = key->ck_data;
+ cria.cri_klen = key->ck_length;
+
+ cria.cri_next = &crie;
+ crie.cri_next = NULL;
+ crip = &cria;
+ // Everything else is bzero'd
+
+ error = crypto_newsession(&sid, crip,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ if (error != 0) {
+ printf("%s(%d): crypto_newsession failed with %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto bad;
+ }
+ sessp->fs_sid = sid;
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+ return (error);
+}
+
+/*
+ * The meat of encryption/decryption.
+ * If sessp is NULL, then it will create a
+ * temporary cryptographic session, and release
+ * it when done.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ struct cryptodesc *enc_desc, *auth_desc;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ freebsd_crypt_session_t *session = NULL;
+ int error;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(2);
+ if (crp == NULL) {
+ error = ENOMEM;
+ goto bad;
+ }
+
+ auth_desc = crp->crp_desc;
+ enc_desc = auth_desc->crd_next;
+
+ crp->crp_session = session->fs_sid;
+ crp->crp_ilen = auth_len + datalen;
+ crp->crp_buf = (void*)data_uio;
+ crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC;
+
+ auth_desc->crd_skip = 0;
+ auth_desc->crd_len = auth_len;
+ auth_desc->crd_inject = auth_len + datalen;
+ auth_desc->crd_alg = xauth->type;
+#ifdef FCRYPTO_DEBUG
+ printf("%s: auth: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len,
+ auth_desc->crd_inject);
+#endif
+
+ enc_desc->crd_skip = auth_len;
+ enc_desc->crd_len = datalen;
+ enc_desc->crd_inject = auth_len;
+ enc_desc->crd_alg = xform->type;
+ enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT;
+ bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN);
+ enc_desc->crd_next = NULL;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s: enc: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len,
+ enc_desc->crd_inject);
+#endif
+
+ if (encrypt)
+ enc_desc->crd_flags |= CRD_F_ENCRYPT;
+
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+#endif
diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c
new file mode 100644
index 000000000..268c843e5
--- /dev/null
+++ b/module/os/freebsd/zfs/dmu_os.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
+#include <sys/racct.h>
+#include <sys/vm.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+
+
+#ifndef IDX_TO_OFF
+#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#endif
+
+#if __FreeBSD_version < 1300051
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY
+#else
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY
+#endif
+
+
+#if __FreeBSD_version < 1300072
+#define dmu_page_lock(m) vm_page_lock(m)
+#define dmu_page_unlock(m) vm_page_unlock(m)
+#else
+#define dmu_page_lock(m)
+#define dmu_page_unlock(m)
+#endif
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ vm_page_t *ma, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ struct sf_buf *sf;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(ptoa((*ma)->pindex), ==,
+ db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(*ma, &sf);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(sf);
+ ma += 1;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size)
+{
+ struct sf_buf *sf;
+ vm_object_t vmobj;
+ vm_page_t m;
+ dmu_buf_t **dbp;
+ dmu_buf_t *db;
+ caddr_t va;
+ int numbufs, i;
+ int bufoff, pgoff, tocpy;
+ int mi, di;
+ int err;
+
+ ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+ ASSERT(last_size <= PAGE_SIZE);
+
+ err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+ IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+ if (err != 0)
+ return (err);
+
+#ifdef DEBUG
+ IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+ if (dbp[0]->db_offset != 0 || numbufs > 1) {
+ for (i = 0; i < numbufs; i++) {
+ ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+ zfs_vmobject_wlock(vmobj);
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ if (m != bogus_page) {
+ vm_page_assert_xbusied(m);
+ ASSERT(vm_page_none_valid(m));
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ if (m != bogus_page) {
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+ }
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ if (m != bogus_page)
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ if (m != bogus_page) {
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and , thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
+ */
+ if (mi == count) {
+ ASSERT(di >= numbufs - 1);
+ IMPLY(*rahead != 0, di == numbufs - 1);
+ IMPLY(*rahead != 0, bufoff != 0);
+ ASSERT(pgoff == 0);
+ }
+ if (di == numbufs) {
+ ASSERT(mi >= count - 1);
+ ASSERT(*rahead == 0);
+ IMPLY(pgoff == 0, mi == count);
+ if (pgoff != 0) {
+ ASSERT(mi == count - 1);
+ ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+ }
+ }
+#endif
+ if (pgoff != 0) {
+ ASSERT(m != bogus_page);
+ bzero(va + pgoff, PAGESIZE - pgoff);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+
+ for (i = 0; i < *rahead; i++) {
+ m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, tocpy);
+ if (tocpy < PAGESIZE) {
+ ASSERT(i == *rahead - 1);
+ ASSERT((db->db_size & PAGE_MASK) != 0);
+ bzero(va + tocpy, PAGESIZE - tocpy);
+ }
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rahead = i;
+ zfs_vmobject_wunlock(vmobj);
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
diff --git a/module/os/freebsd/zfs/hkdf.c b/module/os/freebsd/zfs/hkdf.c
new file mode 100644
index 000000000..8324ff231
--- /dev/null
+++ b/module/os/freebsd/zfs/hkdf.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/hkdf.h>
+#include <sys/freebsd_crypto.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ crypto_key_t key;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+ key.ck_data = salt;
+
+ crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH);
+
+ return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ struct hmac_ctx ctx;
+ crypto_key_t key;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+ uint8_t T[SHA512_DIGEST_LENGTH];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+ key.ck_data = extract_key;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ crypto_mac_init(&ctx, &key);
+ crypto_mac_update(&ctx, T, T_len);
+ crypto_mac_update(&ctx, info, info_len);
+ crypto_mac_update(&ctx, &c, 1);
+ crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH);
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+ pos += SHA512_DIGEST_LENGTH;
+ }
+
+ return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ return (ret);
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c
new file mode 100644
index 000000000..10807afa3
--- /dev/null
+++ b/module/os/freebsd/zfs/kmod_core.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/fm/util.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_ioctl_impl.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+
+static int zfs_version_ioctl = ZFS_IOCVER_ZOF;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+ 0, "ZFS_IOCTL_VERSION");
+
+static struct cdev *zfsdev;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+extern void zfs_ioctl_init(void);
+
+
+static struct root_hold_token *zfs_root_token;
+
+extern uint_t rrw_tsd_key;
+extern uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+extern zfsdev_state_t *zfsdev_state_list;
+
+#define ZFS_MIN_KSTACK_PAGES 4
+
+static void
+zfs_cmd_bsd12_to_zof(zfs_cmd_legacy_t *src, zfs_cmd_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ memcpy(&dst->zc_begin_record, &src->zc_begin_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_begin_record));
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_zoneid = src->zc_jailid;
+}
+
+static void
+zfs_cmd_zof_to_bsd12(zfs_cmd_t *src, zfs_cmd_legacy_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ memcpy(&dst->zc_begin_record, &src->zc_begin_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_begin_record));
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_jailid = src->zc_zoneid;
+}
+
+static int
+zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ uint_t len, vecnum;
+ zfs_iocparm_t *zp;
+ zfs_cmd_t *zc;
+ zfs_cmd_legacy_t *zcl;
+ int rc, error;
+ void *uaddr;
+
+ len = IOCPARM_LEN(zcmd);
+ vecnum = zcmd & 0xff;
+ zp = (void *)arg;
+ uaddr = (void *)zp->zfs_cmd;
+ error = 0;
+ zcl = NULL;
+
+ if (len != sizeof (zfs_iocparm_t)) {
+ printf("len %d vecnum: %d sizeof (zfs_cmd_t) %lu\n",
+ len, vecnum, sizeof (zfs_cmd_t));
+ return (EINVAL);
+ }
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ /*
+ * Remap ioctl code for legacy user binaries
+ */
+ if (zp->zfs_ioctl_version == ZFS_IOCVER_FREEBSD) {
+ if (vecnum >= sizeof (zfs_ioctl_bsd12_to_zof)/sizeof (long)) {
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (ENOTSUP);
+ }
+ zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+ vecnum = zfs_ioctl_bsd12_to_zof[vecnum];
+ if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ zfs_cmd_bsd12_to_zof(zcl, zc);
+ } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ error = zfsdev_ioctl_common(vecnum, zc);
+ if (zcl) {
+ zfs_cmd_zof_to_bsd12(zc, zcl);
+ rc = copyout(zcl, uaddr, sizeof (*zcl));
+ } else {
+ rc = copyout(zc, uaddr, sizeof (*zc));
+ }
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+out:
+ if (zcl)
+ kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+ zfsdev_state_t *zs, *zsp = data;
+
+ mutex_enter(&zfsdev_state_lock);
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs == zsp)
+ break;
+ }
+ if (zs == NULL || zs->zs_minor <= 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return;
+ }
+ zs->zs_minor = -1;
+ zfs_onexit_destroy(zs->zs_onexit);
+ zfs_zevent_destroy(zs->zs_zevent);
+ mutex_exit(&zfsdev_state_lock);
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ boolean_t newzs = B_FALSE;
+ minor_t minor;
+ zfsdev_state_t *zs, *zsprev = NULL;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == -1)
+ break;
+ zsprev = zs;
+ }
+
+ if (!zs) {
+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ newzs = B_TRUE;
+ }
+
+ devfs_set_cdevpriv(zs, zfsdev_close);
+
+ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+ if (newzs) {
+ zs->zs_minor = minor;
+ wmb();
+ zsprev->zs_next = zs;
+ } else {
+ wmb();
+ zs->zs_minor = minor;
+ }
+ return (0);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (error);
+}
+
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zfsdev_open,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DRIVER
+};
+
+int
+zfsdev_attach(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+ ZFS_DRIVER);
+ return (0);
+}
+
+void
+zfsdev_detach(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+int
+zfs__init(void)
+{
+ int error;
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+ printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+ "overflow panic!\nPlease consider adding "
+ "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+ ZFS_MIN_KSTACK_PAGES);
+#endif
+ zfs_root_token = root_mount_hold("ZFS");
+ if ((error = zfs_kmod_init()) != 0) {
+ printf("ZFS: Failed to Load ZFS Filesystem"
+ ", rc = %d\n", error);
+ root_mount_rel(zfs_root_token);
+ return (error);
+ }
+
+
+ tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+ printf("ZFS storage pool version: features support ("
+ SPA_VERSION_STRING ")\n");
+ root_mount_rel(zfs_root_token);
+ ddi_sysevent_init();
+ return (0);
+}
+
+int
+zfs__fini(void)
+{
+ if (zfs_busy() || zvol_busy() ||
+ zio_injection_enabled) {
+ return (EBUSY);
+ }
+ zfs_kmod_fini();
+ tsd_destroy(&zfs_geom_probe_vdev_key);
+ return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+ /*
+ * ZFS fini routines can not properly work in a panic-ed system.
+ */
+ if (panicstr == NULL)
+ zfs__fini();
+}
+
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int err;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = zfs__init();
+ if (err == 0)
+ zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+ shutdown_post_sync, zfs_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ return (err);
+ case MOD_UNLOAD:
+ err = zfs__fini();
+ if (err == 0 && zfs_shutdown_event_tag != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+ zfs_shutdown_event_tag);
+ return (err);
+ case MOD_SHUTDOWN:
+ return (0);
+ default:
+ break;
+ }
+ return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1);
diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c
new file mode 100644
index 000000000..ed124a5fa
--- /dev/null
+++ b/module/os/freebsd/zfs/spa_os.c
@@ -0,0 +1,280 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <[email protected]>. All rights reserved.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/abd.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
+ uint64_t *count);
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+ nvlist_t **configs, **tops;
+ nvlist_t *config;
+ nvlist_t *best_cfg, *nvtop, *nvroot;
+ uint64_t *holes;
+ uint64_t best_txg;
+ uint64_t nchildren;
+ uint64_t pgid;
+ uint64_t count;
+ uint64_t i;
+ uint_t nholes;
+
+ if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+ return (NULL);
+
+ ASSERT3U(count, !=, 0);
+ best_txg = 0;
+ for (i = 0; i < count; i++) {
+ uint64_t txg;
+
+ VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0);
+ if (txg > best_txg) {
+ best_txg = txg;
+ best_cfg = configs[i];
+ }
+ }
+
+ nchildren = 1;
+ nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+ holes = NULL;
+ nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+ &holes, &nholes);
+
+ tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < nchildren; i++) {
+ if (i >= count)
+ break;
+ if (configs[i] == NULL)
+ continue;
+ VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+ }
+ for (i = 0; holes != NULL && i < nholes; i++) {
+ if (i >= nchildren)
+ continue;
+ if (tops[holes[i]] != NULL)
+ continue;
+ nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+ holes[i]) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+ for (i = 0; i < nchildren; i++) {
+ if (tops[i] != NULL)
+ continue;
+ nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MISSING) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+ i) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+
+ /*
+ * Create pool config based on the best vdev config.
+ */
+ nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ tops, nchildren) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+ /*
+ * Drop vdev config elements that should not be present at pool level.
+ */
+ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+ nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+ for (i = 0; i < count; i++)
+ nvlist_free(configs[i]);
+ kmem_free(configs, count * sizeof (void *));
+ for (i = 0; i < nchildren; i++)
+ nvlist_free(tops[i]);
+ kmem_free(tops, nchildren * sizeof (void *));
+ nvlist_free(nvroot);
+ return (config);
+}
+
+int
+spa_import_rootpool(const char *name)
+{
+ spa_t *spa;
+ vdev_t *rvd;
+ nvlist_t *config, *nvtop;
+ uint64_t txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ config = spa_generate_rootconf(name);
+
+ mutex_enter(&spa_namespace_lock);
+ if (config != NULL) {
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0 && strcmp(name, pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+ == 0);
+
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * The pool could already be imported,
+ * e.g., after reboot -r.
+ */
+ if (spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ return (0);
+ }
+
+ /*
+ * Remove the existing root pool from the namespace so
+ * that we can replace it with the correct config
+ * we just read in.
+ */
+ spa_remove(spa);
+ }
+ spa = spa_add(pname, config, NULL);
+
+ /*
+ * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+ * via spa_version().
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+ } else if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+ name);
+ return (EIO);
+ } else {
+ VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+ }
+ spa->spa_is_root = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+ return ("freebsd");
+}
diff --git a/module/os/freebsd/zfs/spa_stats.c b/module/os/freebsd/zfs/spa_stats.c
new file mode 100644
index 000000000..45c880ada
--- /dev/null
+++ b/module/os/freebsd/zfs/spa_stats.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+void
+spa_stats_init(spa_t *spa)
+{
+
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+
+}
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+ uint64_t extents_written, uint64_t bytes_written,
+ uint64_t extents_skipped, uint64_t bytes_skipped,
+ uint64_t extents_failed, uint64_t bytes_failed)
+{
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+}
+
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+
+}
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+ hrtime_t completed_time)
+{
+ return (0);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+ return (NULL);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+
+}
+
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+ uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+ int error)
+{
+
+}
+
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+ hrtime_t duration)
+{
+ return (0);
+}
+
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+ return (0);
+}
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
new file mode 100644
index 000000000..ea9c1b3f1
--- /dev/null
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_context.h>
+
+#include <sys/arc_impl.h>
+#include <sys/dsl_pool.h>
+
+#include <../zfs_config.h>
+
+/* BEGIN CSTYLED */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS events");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "space allocation");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "reconstruct");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "multihost protection");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "metaslab group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "lua");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "l2arc");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS Adaptive Replacement Cache");
+
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+ "ZFS VDEV Mirror");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "livelist state");
+SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "condense knobs");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "receive knobs");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "send knobs");
+
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD,
+ (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version");
+
+extern arc_state_t ARC_anon;
+extern arc_state_t ARC_mru;
+extern arc_state_t ARC_mru_ghost;
+extern arc_state_t ARC_mfu;
+extern arc_state_t ARC_mfu_ghost;
+extern arc_state_t ARC_l2c_only;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+
+/* arc.c */
+
+/* legacy compat */
+extern unsigned long l2arc_write_max; /* def max write size */
+extern unsigned long l2arc_write_boost; /* extra warmup write */
+extern unsigned long l2arc_headroom; /* # of dev writes */
+extern unsigned long l2arc_headroom_boost;
+extern unsigned long l2arc_feed_secs; /* interval seconds */
+extern unsigned long l2arc_feed_min_ms; /* min interval msecs */
+extern int l2arc_noprefetch; /* don't cache prefetch bufs */
+extern int l2arc_feed_again; /* turbo warmup */
+extern int l2arc_norw; /* no reads during writes */
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+ &l2arc_write_max, 0, "max write size (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+ &l2arc_write_boost, 0, "extra write during warmup (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+ &l2arc_headroom, 0, "number of dev writes (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+ &l2arc_feed_secs, 0, "interval seconds (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+ &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+ &l2arc_feed_again, 0, "turbo warmup (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+ &l2arc_norw, 0, "no reads during writes (LEGACY)");
+#if 0
+extern int zfs_compressed_arc_enabled;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW,
+ &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)");
+#endif
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
+
+extern int arc_no_grow_shift;
+extern int arc_shrink_shift;
+
+extern arc_stats_t arc_stats;
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
+#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
+
+static int
+sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val;
+ int err;
+
+ val = arc_no_grow_shift;
+ err = sysctl_handle_32(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val >= arc_shrink_shift)
+ return (EINVAL);
+
+ arc_no_grow_shift = val;
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
+ 0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
+ "log2(fraction of ARC which must be free to allow growing)");
+/* dbuf.c */
+
+
+/* dmu.c */
+
+/* dmu_zfetch.c */
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
+
+/* max bytes to prefetch per stream (default 8MB) */
+extern uint32_t zfetch_max_distance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+ &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)");
+
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+extern uint32_t zfetch_max_idistance;
+SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+ &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
+
+/* dsl_pool.c */
+
+/* dnode.c */
+extern int zfs_default_bs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
+ &zfs_default_bs, 0, "Default dnode block shift");
+
+extern int zfs_default_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
+ &zfs_default_ibs, 0, "Default dnode indirect block shift");
+
+
+/* dsl_scan.c */
+
+/* metaslab.c */
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+extern boolean_t metaslab_lba_weighting_enabled;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting, CTLFLAG_RWTUN,
+ &metaslab_lba_weighting_enabled, 0,
+ "Enable LBA weighting (i.e. outer tracks are given preference)");
+
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+extern int zfs_metaslab_sm_blksz_no_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_no_log, 0,
+ "Block size for space map in pools with log space map disabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+extern int zfs_metaslab_sm_blksz_with_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_with_log, 0,
+ "Block size for space map in pools with log space map enabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+extern int zfs_condense_pct;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+ &zfs_condense_pct, 0,
+ "Condense on-disk spacemap when it is more than this many percents"
+ " of in-memory counterpart");
+
+extern int zfs_remove_max_segment;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN,
+ &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to"
+ " allocate when removing a device");
+
+extern int zfs_removal_suspend_progress;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN,
+ &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while"
+ " in the middle of a removal");
+
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+extern uint64_t metaslab_df_alloc_threshold;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+ &metaslab_df_alloc_threshold, 0,
+ "Minimum size which forces the dynamic allocator to change it's allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+extern int metaslab_df_free_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+ &metaslab_df_free_pct, 0,
+ "The minimum free space, in percent, which must be available in a "
+ "space map to continue allocations in a first-fit fashion");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+extern int metaslab_load_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+ &metaslab_load_pct, 0,
+ "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+extern int metaslab_preload_limit;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+ &metaslab_preload_limit, 0,
+ "Max number of metaslabs per group to preload");
+
+/* refcount.c */
+extern int reference_tracking_enable;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+ &reference_tracking_enable, 0,
+ "Track reference holders to refcount_t objects, used mostly by ZFS");
+
+/* spa.c */
+extern int zfs_ccw_retry_interval;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
+extern uint64_t zfs_max_missing_tvds_cachefile;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_cachefile, 0,
+ "allow importing pools with missing top-level vdevs in cache file");
+
+extern uint64_t zfs_max_missing_tvds_scan;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_scan, 0,
+ "allow importing pools with missing top-level vdevs during scan");
+
+/* spa_misc.c */
+extern int zfs_flags;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = zfs_flags;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ /*
+ * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+ * arc buffers in the system have the necessary additional
+ * checksum data. However, it is safe to disable at any
+ * time.
+ */
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ val &= ~ZFS_DEBUG_MODIFY;
+ zfs_flags = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
+ sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+int
+param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_synctime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_synctime_ms = val;
+
+ spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_ziotime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_ziotime_ms = val;
+
+ spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int rc;
+
+ if (req->newptr == NULL)
+ strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
+
+ rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (rc || req->newptr == NULL)
+ return (rc);
+ if (strcmp(buf, zfs_deadman_failmode) == 0)
+ return (0);
+ if (!strcmp(buf, "wait"))
+ zfs_deadman_failmode = "wait";
+ if (!strcmp(buf, "continue"))
+ zfs_deadman_failmode = "continue";
+ if (!strcmp(buf, "panic"))
+ zfs_deadman_failmode = "panic";
+
+ return (-param_set_deadman_failmode_common(buf));
+}
+
+
+/* spacemap.c */
+extern int space_map_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+ &space_map_ibs, 0, "Space map indirect block shift");
+
+
+/* vdev.c */
+#ifdef notyet
+extern uint64_t zfs_max_auto_ashift;
+extern uint64_t zfs_min_auto_ashift;
+
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_max_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val > ASHIFT_MAX || val < zfs_min_auto_ashift)
+ return (EINVAL);
+
+ zfs_max_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t),
+ sysctl_vfs_zfs_max_auto_ashift, "QU",
+ "Max ashift used when optimising for logical -> physical sectors size on "
+ "new top-level vdevs.");
+static int
+sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_min_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < ASHIFT_MIN || val > zfs_max_auto_ashift)
+ return (EINVAL);
+
+ zfs_min_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t),
+ sysctl_vfs_zfs_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdevs.");
+#endif
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+extern int zfs_vdev_dtl_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_dtl_sm_blksz, 0,
+ "Block size for DTL space map. Power of 2 and greater than 4096.");
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+extern int zfs_vdev_standard_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_standard_sm_blksz, 0,
+ "Block size for standard space map. Power of 2 and greater than 4096.");
+
+extern int vdev_validate_skip;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN,
+ &vdev_validate_skip, 0,
+ "Enable to bypass vdev_validate().");
+
+
+/* vdev_cache.c */
+
+/* vdev_mirror.c */
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have a higher performance.
+ */
+
+
+/* vdev_queue.c */
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
+extern uint32_t zfs_vdev_ ## name ## _min_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+ &zfs_vdev_ ## name ## _min_active, 0, \
+ "Initial number of I/O requests of type " #name \
+ " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
+extern uint32_t zfs_vdev_ ## name ## _max_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \
+ &zfs_vdev_ ## name ## _max_active, 0, \
+ "Maximum number of I/O requests of type " #name \
+ " active for each device");
+
+
+#undef ZFS_VDEV_QUEUE_KNOB
+
+extern uint32_t zfs_vdev_max_active;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
+ &zfs_vdev_max_active, 0,
+ "The maximum number of I/Os of all types active for each device. (LEGACY)");
+
+extern int zfs_vdev_def_queue_depth;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
+ &zfs_vdev_def_queue_depth, 0,
+ "Default queue depth for each allocator");
+
+/*extern uint64_t zfs_multihost_history;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN,
+ &zfs_multihost_history, 0,
+ "Historical staticists for the last N multihost updates");*/
+
+#ifdef notyet
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
+ &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+#endif
+
+
+/* zio.c */
+#if defined(__LP64__)
+int zio_use_uma = 1;
+#else
+int zio_use_uma = 0;
+#endif
+
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+ "Use uma(9) for ZIO allocations");
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
+
+
+int
+param_set_arc_long(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_long(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_arc_int(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_int(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_slop_shift(SYSCTL_HANDLER_ARGS)
+{
+ int val;
+ int err;
+
+ val = *(int *)arg1;
+
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 1 || val > 31)
+ return (EINVAL);
+
+ *(int *)arg1 = val;
+
+ return (0);
+}
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
new file mode 100644
index 000000000..01851378e
--- /dev/null
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -0,0 +1,326 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/stat.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+ minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+ mode_t mode = 0;
+
+ if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+ mode = O_RDWR;
+ } else if (spa_mode & SPA_MODE_READ) {
+ mode = O_RDONLY;
+ } else if (spa_mode & SPA_MODE_WRITE) {
+ mode = O_WRONLY;
+ }
+
+ return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift)
+{
+ vdev_file_t *vf;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+ * type but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+ error = zfs_file_open(vd->vdev_path,
+ vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_file = fp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (zfs_file_getattr(fp, &zfa)) {
+ return (SET_ERROR(ENODEV));
+ }
+ if (!S_ISREG(zfa.zfa_mode)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+
+ error = zfs_file_getattr(vf->vf_file, &zfa);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = zfa.zfa_size;
+ *ashift = SPA_MINBLOCKSHIFT;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_file != NULL) {
+ zfs_file_close(vf->vf_file);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf;
+ void *buf;
+ ssize_t resid;
+ loff_t off;
+ ssize_t size;
+ int err;
+
+ off = zio->io_offset;
+ size = zio->io_size;
+ resid = 0;
+
+ vf = vd->vdev_tsd;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf_copy(zio->io_abd, buf, size);
+ } else {
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf(zio->io_abd, buf, size);
+ }
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = ENOSPC;
+
+ vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ zio->io_error = zfs_file_fsync(vf->vf_file,
+ O_SYNC|O_DSYNC);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
+#ifdef notyet
+ int mode = 0;
+
+ ASSERT3U(zio->io_size, !=, 0);
+
+ /* XXX FreeBSD has no fallocate routine in file ops */
+ zio->io_error = zfs_file_fallocate(vf->vf_file,
+ mode, zio->io_offset, zio->io_size);
+#endif
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+ return;
+ }
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ NULL,
+ vdev_file_hold,
+ vdev_file_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_FILE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ NULL,
+ vdev_file_hold,
+ vdev_file_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
new file mode 100644
index 000000000..d87bbbc18
--- /dev/null
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -0,0 +1,1195 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_disk.h>
+#include <geom/geom_int.h>
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+ .attrchanged = vdev_geom_attrchanged,
+};
+
+struct consumer_vdev_elem {
+ SLIST_ENTRY(consumer_vdev_elem) elems;
+ vdev_t *vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+/* BEGIN CSTYLED */
+_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
+ == sizeof (struct consumer_priv_t*),
+ "consumer_priv_t* can't be stored in g_consumer.private");
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+/* END CSTYLED */
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids. If NULL, this thread is not tasting geoms. If non NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+ boolean_t do_null_update)
+{
+ boolean_t needs_update = B_FALSE;
+ char *physpath;
+ int error, physpath_len;
+
+ physpath_len = MAXPATHLEN;
+ physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+ error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+ if (error == 0) {
+ char *old_physpath;
+
+ /* g_topology lock ensures that vdev has not been closed */
+ g_topology_assert();
+ old_physpath = vd->vdev_physpath;
+ vd->vdev_physpath = spa_strdup(physpath);
+
+ if (old_physpath != NULL) {
+ needs_update = (strcmp(old_physpath,
+ vd->vdev_physpath) != 0);
+ spa_strfree(old_physpath);
+ } else
+ needs_update = do_null_update;
+ }
+ g_free(physpath);
+
+ /*
+ * If the physical path changed, update the config.
+ * Only request an update for previously unset physpaths if
+ * requested by the caller.
+ */
+ if (needs_update)
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+ if (strcmp(attr, "GEOM::physpath") == 0) {
+ vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
+ return;
+ }
+ }
+}
+
+static void
+vdev_geom_resize(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ spa_t *spa;
+ vdev_t *vd;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vd = elem->vd;
+ if (vd->vdev_state != VDEV_STATE_HEALTHY)
+ continue;
+ spa = vd->vdev_spa;
+ if (!spa->spa_autoexpand)
+ continue;
+ vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ // cppcheck-suppress uninitvar
+ struct consumer_vdev_elem *elem;
+
+ g_topology_assert();
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ /* Vdev close in progress. Ignore the event. */
+ return;
+
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ // cppcheck-suppress All
+ SLIST_FOREACH(elem, priv, elems) {
+ // cppcheck-suppress uninitvar
+ vdev_t *vd = elem->vd;
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ }
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+ int error;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+ if (sanity) {
+ if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible sectorsize %d\n",
+ pp->name, pp->sectorsize);
+ return (NULL);
+ } else if (pp->mediasize < SPA_MINDEVSIZE) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible mediasize %ju\n",
+ pp->name, pp->mediasize);
+ return (NULL);
+ }
+ }
+
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ if (strcmp(gp->name, "zfs::vdev") != 0)
+ continue;
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ gp->attrchanged = vdev_geom_attrchanged;
+ gp->resize = vdev_geom_resize;
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+
+ if (vd != NULL)
+ vd->vdev_tsd = cp;
+
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+ return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Detaching from %s.",
+ cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+ gp = cp->geom;
+ if (open_for_read)
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ if (cp->provider != NULL) {
+ ZFS_LOG(1, "Destroying consumer for %s.",
+ cp->provider->name ? cp->provider->name : "NULL");
+ g_detach(cp);
+ }
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem, *elem_temp;
+
+ g_topology_assert();
+
+ cp = vd->vdev_tsd;
+ vd->vdev_delayed_close = B_FALSE;
+ if (cp == NULL)
+ return;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+ priv = (struct consumer_priv_t *)&cp->private;
+ vd->vdev_tsd = NULL;
+ SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+ if (elem->vd == vd) {
+ SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+ g_free(elem);
+ }
+ }
+
+ vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO
+ * operation is described by parallel entries from each array. There may be
+ * more bios actually issued than entries in the array
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+ off_t *sizes, int *errors, int ncmds)
+{
+ struct bio **bios;
+ uint8_t *p;
+ off_t off, maxio, s, end;
+ int i, n_bios, j;
+ size_t bios_size;
+
+ maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+ n_bios = 0;
+
+ /* How many bios are required for all commands ? */
+ for (i = 0; i < ncmds; i++)
+ n_bios += (sizes[i] + maxio - 1) / maxio;
+
+ /* Allocate memory for the bios */
+ bios_size = n_bios * sizeof (struct bio *);
+ bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+ /* Prepare and issue all of the bios */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ p = datas[i];
+ s = sizes[i];
+ end = off + s;
+ ASSERT((off % cp->provider->sectorsize) == 0);
+ ASSERT((s % cp->provider->sectorsize) == 0);
+
+ for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+ bios[j] = g_alloc_bio();
+ bios[j]->bio_cmd = cmds[i];
+ bios[j]->bio_done = NULL;
+ bios[j]->bio_offset = off;
+ bios[j]->bio_length = MIN(s, maxio);
+ bios[j]->bio_data = (caddr_t)p;
+ g_io_request(bios[j], cp);
+ }
+ }
+ ASSERT(j == n_bios);
+
+ /* Wait for all of the bios to complete, and clean them up */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ s = sizes[i];
+ end = off + s;
+
+ for (; off < end; off += maxio, s -= maxio, j++) {
+ errors[i] = biowait(bios[j], "vdev_geom_io") ||
+ errors[i];
+ g_destroy_bio(bios[j]);
+ }
+ }
+ kmem_free(bios, bios_size);
+}
+
+/*
+ * Read the vdev config from a device. Return the number of valid labels that
+ * were found. The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
+{
+ struct g_provider *pp;
+ nvlist_t *config;
+ vdev_phys_t *vdev_lists[VDEV_LABELS];
+ char *buf;
+ size_t buflen;
+ uint64_t psize, state, txg;
+ off_t offsets[VDEV_LABELS];
+ off_t size;
+ off_t sizes[VDEV_LABELS];
+ int cmds[VDEV_LABELS];
+ int errors[VDEV_LABELS];
+ int l, nlabels;
+
+ g_topology_assert_not();
+
+ pp = cp->provider;
+ ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+ psize = pp->mediasize;
+ psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+ size = sizeof (*vdev_lists[0]) + pp->sectorsize -
+ ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+ buflen = sizeof (vdev_lists[0]->vp_nvlist);
+
+ /* Create all of the IO requests */
+ for (l = 0; l < VDEV_LABELS; l++) {
+ cmds[l] = BIO_READ;
+ vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+ offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+ sizes[l] = size;
+ errors[l] = 0;
+ ASSERT(offsets[l] % pp->sectorsize == 0);
+ }
+
+ /* Issue the IO requests */
+ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+ VDEV_LABELS);
+
+ /* Parse the labels */
+ config = *configp = NULL;
+ nlabels = 0;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (errors[l] != 0)
+ continue;
+
+ buf = vdev_lists[l]->vp_nvlist;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 || state > POOL_STATE_L2CACHE) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0 || txg == 0)) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (*configp != NULL)
+ nvlist_free(*configp);
+ *configp = config;
+ nlabels++;
+ }
+
+ /* Free the label storage */
+ for (l = 0; l < VDEV_LABELS; l++)
+ kmem_free(vdev_lists[l], size);
+
+ return (nlabels);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+ nvlist_t **new_configs;
+ uint64_t i;
+
+ if (id < *count)
+ return;
+ new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
+ KM_SLEEP);
+ for (i = 0; i < *count; i++)
+ new_configs[i] = (*configs)[i];
+ if (*configs != NULL)
+ kmem_free(*configs, *count * sizeof (void *));
+ *configs = new_configs;
+ *count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+ const char *name, uint64_t *known_pool_guid)
+{
+ nvlist_t *vdev_tree;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ uint64_t id, txg, known_txg;
+ char *pname;
+
+ if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+ strcmp(pname, name) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+ goto ignore;
+
+ VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ if (*known_pool_guid != 0) {
+ if (pool_guid != *known_pool_guid)
+ goto ignore;
+ } else
+ *known_pool_guid = pool_guid;
+
+ resize_configs(configs, count, id);
+
+ if ((*configs)[id] != NULL) {
+ VERIFY(nvlist_lookup_uint64((*configs)[id],
+ ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+ if (txg <= known_txg)
+ goto ignore;
+ nvlist_free((*configs)[id]);
+ }
+
+ (*configs)[id] = cfg;
+ return;
+
+ignore:
+ nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+ nvlist_t ***configs, uint64_t *count)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp;
+ struct g_consumer *zcp;
+ nvlist_t *vdev_cfg;
+ uint64_t pool_guid;
+ int nlabels;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ *configs = NULL;
+ *count = 0;
+ pool_guid = 0;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (pp->flags & G_PF_WITHER)
+ continue;
+ zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (zcp == NULL)
+ continue;
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
+ g_topology_lock();
+ vdev_geom_detach(zcp, B_TRUE);
+ if (nlabels == 0)
+ continue;
+ ZFS_LOG(1, "successfully read vdev config");
+
+ process_vdev_config(configs, count,
+ vdev_cfg, name, &pool_guid);
+ }
+ }
+ }
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
+ ZERO_MATCH = 1, /* Should never be returned */
+ ONE_MATCH = 2, /* 1 label matching the vdev_guid */
+ TWO_MATCH = 3, /* 2 label matching the vdev_guid */
+ THREE_MATCH = 4, /* 3 label matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+ nvlist_t *config;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
+
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
+
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
+ }
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+ * The second condition handles possible race on vdev detach, when
+ * remaining vdev receives GUID of destroyed top level mirror vdev.
+ */
+ if (vdev_guid == vd->vdev_guid) {
+ ZFS_LOG(1, "guids match for provider %s.", pp->name);
+ return (ZERO_MATCH + nlabels);
+ } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+ ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+ return (TOPGUID_MATCH);
+ }
+ ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+ pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+ return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp, *best_pp;
+ struct g_consumer *cp;
+ const char *vdpath;
+ enum match match, best_match;
+
+ g_topology_assert();
+
+ vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
+ cp = NULL;
+ best_pp = NULL;
+ best_match = NO_MATCH;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ match = vdev_attach_ok(vd, pp);
+ if (match > best_match) {
+ best_match = match;
+ best_pp = pp;
+ } else if (match == best_match) {
+ if (strcmp(pp->name, vdpath) == 0) {
+ best_pp = pp;
+ }
+ }
+ if (match == FULL_MATCH)
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (best_pp) {
+ cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+ if (cp == NULL) {
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ best_pp->name);
+ }
+ }
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ char *buf;
+ size_t len;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+ cp = vdev_geom_attach_by_guids(vd);
+ if (cp != NULL) {
+ len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+ buf = kmem_alloc(len, KM_SLEEP);
+
+ snprintf(buf, len, "/dev/%s", cp->provider->name);
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+
+ ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid, cp->provider->name);
+ } else {
+ ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid);
+ }
+
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ cp = NULL;
+ pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
+ if (pp != NULL) {
+ ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+ if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+ cp = vdev_geom_attach(pp, vd, B_FALSE);
+ }
+
+ return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ int error, has_trim;
+ uint16_t rate;
+
+ /*
+ * Set the TLS to indicate downstack that we
+ * should not access zvols
+ */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if ((cp = vd->vdev_tsd) != NULL) {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
+
+ DROP_GIANT();
+ g_topology_lock();
+ error = 0;
+
+ if (vd->vdev_spa->spa_is_splitting ||
+ ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+ (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
+ /*
+ * We are dealing with a vdev that hasn't been previously
+ * opened (since boot), and we are not loading an
+ * existing pool configuration. This looks like a
+ * vdev add operation to a new or existing pool.
+ * Assume the user knows what he/she is doing and find
+ * GEOM provider by its name, ignoring GUID mismatches.
+ *
+ * XXPOLICY: It would be safer to only allow a device
+ * that is unlabeled or labeled but missing
+ * GUID information to be opened in this fashion,
+ * unless we are doing a split, in which case we
+ * should allow any guid.
+ */
+ cp = vdev_geom_open_by_path(vd, 0);
+ } else {
+ /*
+ * Try using the recorded path for this device, but only
+ * accept it if its label data contains the expected GUIDs.
+ */
+ cp = vdev_geom_open_by_path(vd, 1);
+ if (cp == NULL) {
+ /*
+ * The device at vd->vdev_path doesn't have the
+ * expected GUIDs. The disks might have merely
+ * moved around so try all other GEOM providers
+ * to find one with the right GUIDs.
+ */
+ cp = vdev_geom_open_by_guids(vd);
+ }
+ }
+
+ /* Clear the TLS now that tasting is done */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+ if (cp == NULL) {
+ ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
+ error = ENOENT;
+ } else {
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int spamode;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (cp->private == NULL)
+ SLIST_INIT(priv);
+ elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
+ elem->vd = vd;
+ SLIST_INSERT_HEAD(priv, elem, elems);
+
+ spamode = spa_mode(vd->vdev_spa);
+ if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+ !ISP2(cp->provider->sectorsize)) {
+ ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+ cp->provider->name);
+
+ vdev_geom_close_locked(vd);
+ error = EINVAL;
+ cp = NULL;
+ } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ error = g_access(cp, 0, 1, 0);
+ if (error == 0)
+ break;
+ g_topology_unlock();
+ tsleep(vd, 0, "vdev", hz / 2);
+ g_topology_lock();
+ }
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for "
+ "writing (error=%d).\n",
+ cp->provider->name, error);
+ vdev_geom_close_locked(vd);
+ cp = NULL;
+ }
+ }
+ }
+
+ /* Fetch initial physical path information for this device. */
+ if (cp != NULL) {
+ vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+ /* Set other GEOM characteristics */
+ vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
+ }
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
+ error);
+ return (error);
+ }
+skip_open:
+ pp = cp->provider;
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *max_psize = *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size and preferred
+ * transfer size.
+ */
+ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+#ifdef notyet
+ if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
+ pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0)
+ *physical_ashift = highbit(pp->stripesize) - 1;
+#endif
+ /*
+ * Clear the nowritecache settings, so that on a vdev_reopen()
+ * we will try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ /* Inform the ZIO pipeline that we are non-rotational. */
+ error = g_getattr("GEOM::rotation_rate", cp, &rate);
+ if (error == 0 && rate == DISK_RR_NON_ROTATING)
+ vd->vdev_nonrot = B_TRUE;
+ else
+ vd->vdev_nonrot = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ error = g_getattr("GEOM::candelete", cp, &has_trim);
+ vd->vdev_has_trim = (error == 0 && has_trim);
+
+ /* Set when device reports it supports secure TRIM. */
+ /* unavailable on FreeBSD */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ struct g_consumer *cp;
+
+ cp = vd->vdev_tsd;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if (!vd->vdev_reopening ||
+ (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+ (cp->provider != NULL && cp->provider->error != 0))))
+ vdev_geom_close_locked(vd);
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_t *vd;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ vd = zio->io_vd;
+ zio->io_error = bp->bio_error;
+ if (zio->io_error == 0 && bp->bio_resid != 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ switch (zio->io_error) {
+ case ENOTSUP:
+ /*
+ * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+ * that future attempts will never succeed. In this case
+ * we set a persistent flag so that we don't bother with
+ * requests in the future.
+ */
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ vd->vdev_nowritecache = B_TRUE;
+ break;
+ case BIO_DELETE:
+ break;
+ }
+ break;
+ case ENXIO:
+ if (!vd->vdev_remove_wanted) {
+ /*
+ * If provider's error is set we assume it is being
+ * removed.
+ */
+ if (bp->bio_to->error != 0) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa,
+ SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
+ break;
+ }
+
+ /*
+ * We have to split bio freeing into two parts, because the ABD code
+ * cannot be called in this context and vdev_op_io_done is not called
+ * for ZIO_TYPE_IOCTL zio-s.
+ */
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+ }
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ struct g_consumer *cp;
+ struct bio *bp;
+
+ vd = zio->io_vd;
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ } else {
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ if (zfs_nocacheflush ||
+ vdev_geom_bio_flush_disable)
+ break;
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+ goto sendreq;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+ }
+
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_TRIM:
+ if (!vdev_geom_bio_delete_disable) {
+ goto sendreq;
+ }
+ zio_execute(zio);
+ return;
+ default:
+ ;
+ /* PASSTHROUGH --- placate compiler */
+ }
+sendreq:
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM ||
+ zio->io_type == ZIO_TYPE_IOCTL);
+
+ cp = vd->vdev_tsd;
+ if (cp == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->bio_cmd = BIO_READ;
+ bp->bio_data =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->bio_cmd = BIO_WRITE;
+ bp->bio_data =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+ break;
+ case ZIO_TYPE_TRIM:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_flags |= BIO_ORDERED;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ default:
+ panic("invalid zio->io_type: %d\n", zio->io_type);
+ }
+ bp->bio_done = vdev_geom_io_intr;
+ zio->io_bio = bp;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ struct bio *bp = zio->io_bio;
+
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ ASSERT(bp == NULL);
+ return;
+ }
+
+ if (bp == NULL) {
+ ASSERT3S(zio->io_error, ==, ENXIO);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
+
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_geom_open,
+ vdev_geom_close,
+ vdev_default_asize,
+ vdev_geom_io_start,
+ vdev_geom_io_done,
+ NULL,
+ NULL,
+ vdev_geom_hold,
+ vdev_geom_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c
new file mode 100644
index 000000000..e734a2af8
--- /dev/null
+++ b/module/os/freebsd/zfs/vdev_label_os.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ zio_t *zio;
+ abd_t *pad2;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+
+ if (size > VDEV_PAD_SIZE)
+ return (EINVAL);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (ENODEV);
+ if (vdev_is_dead(vd))
+ return (ENXIO);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
+ abd_copy_from_buf(pad2, buf, size);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ vdev_label_write(zio, vd, 0, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(pad2);
+ return (error);
+}
diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c
new file mode 100644
index 000000000..c11e16437
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_acl.c
@@ -0,0 +1,2738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
+
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ zfs_ace_v0_get_mask,
+ zfs_ace_v0_set_mask,
+ zfs_ace_v0_get_flags,
+ zfs_ace_v0_set_flags,
+ zfs_ace_v0_get_type,
+ zfs_ace_v0_set_type,
+ zfs_ace_v0_get_who,
+ zfs_ace_v0_set_who,
+ zfs_ace_v0_size,
+ zfs_ace_v0_abstract_size,
+ zfs_ace_v0_mask_off,
+ zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ zfs_ace_fuid_get_mask,
+ zfs_ace_fuid_set_mask,
+ zfs_ace_fuid_get_flags,
+ zfs_ace_fuid_set_flags,
+ zfs_ace_fuid_get_type,
+ zfs_ace_fuid_set_type,
+ zfs_ace_fuid_get_who,
+ zfs_ace_fuid_set_who,
+ zfs_ace_fuid_size,
+ zfs_ace_fuid_abstract_size,
+ zfs_ace_fuid_mask_off,
+ zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file use to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_isa_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * after upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_isa_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(zp->z_zfsvfs->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (obj_type == VDIR &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First create the ACE in a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+ * everytime.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+ oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * Only care if this IDENTIFIER_GROUP or
+ * USER ACE denies execute access to someone,
+ * mode is not affected
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
+
+/*
+ * Read an external acl object. If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+int
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize;
+ int acl_count;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
+}
+
+/*
+ * common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
+
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+ * Arrgh, we have to handle old on disk format
+ * as well as newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+ * If Old version then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ boolean_t isdir;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ isdir = (vtype == VDIR);
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+ * We preserve owner@, group@, or @everyone
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions granted by ACEs to be no greater
+ * than permissions of the requested group mode.
+ * Applies when the "aclmode" property is set to
+ * "groupmask".
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+ (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = (vtype == VDIR);
+ boolean_t isreg = (vtype == VREG);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
+ return (aclp);
+
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(vtype, iflags))
+ continue;
+
+ /*
+ * If owner@, group@, or everyone@ inheritable
+ * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_acl_t *paclp;
+ gid_t gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ if ((flag & IS_ROOT_NODE) == 0) {
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ } else
+ ASSERT(dzp->z_vnode == NULL);
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+ &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr,
+ ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr,
+ ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != dzp->z_gid &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ } else {
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
+#ifdef __FreeBSD_kernel__
+ gid = acl_ids->z_fgid = dzp->z_gid;
+#else
+ gid = crgetgid(cr);
+#endif
+ }
+ }
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set in on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (vap->va_type == VDIR)) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (vap->va_type == VDIR)
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)) ||
+ (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common().
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK
+ * (sunlnk) is set. We just don't allow directory removal, which is
+ * handled in zfs_zaccess_delete().
+ */
+ if ((v4_mode & ACE_DELETE) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCESS if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != UID_NOBODY &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Return true if any access whatsoever granted, we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ /*
+ * Note: ZFS_READONLY represents the "DOS R/O" attribute.
+ * When that flag is set, we should behave as if write access
+ * were not granted by anything in the ACL. In particular:
+ * We _must_ allow writes after opening the file r/w, then
+ * setting the DOS R/O attribute, and writing some more.
+ * (Similar to how you can write after fchmod(fd, 0444).)
+ *
+ * Therefore ZFS_READONLY is ignored in the dataset check
+ * above, and checked here as if part of the ACL check.
+ * Also note: DOS R/O is ignored for directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (ZTOV(zp)->v_type != VDIR) &&
+ (zp->z_pflags & ZFS_READONLY)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+/*
+ * Check if VEXEC is allowed.
+ *
+ * This routine is based on zfs_fastaccesschk_execute which has slowpath
+ * calling zfs_zaccess. This would be incorrect on FreeBSD (see
+ * zfs_freebsd_access for the difference). Thus this variant let's the
+ * caller handle the slowpath (if necessary).
+ *
+ * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED.
+ *
+ * Safe access to znode_t is provided by the vnode lock.
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t uid = crgetuid(cr);
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (1);
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ return (1);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+ return (0);
+
+ mutex_enter(&zdp->z_acl_lock);
+ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
+ goto out_slow;
+ }
+
+ if (uid == zdp->z_uid) {
+ owner = B_TRUE;
+ if (zdp->z_mode & S_IXUSR) {
+ goto out;
+ } else {
+ goto out_slow;
+ }
+ }
+ if (groupmember(zdp->z_gid, cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_mode & S_IXGRP) {
+ goto out;
+ } else {
+ goto out_slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_mode & S_IXOTH) {
+ goto out;
+ }
+ }
+out:
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+out_slow:
+ mutex_exit(&zdp->z_acl_lock);
+ return (1);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp = NULL;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+ /*
+ * In FreeBSD, we don't care about permissions of individual ADS.
+ * Note that not checking them is not just an optimization - without
+ * this shortcut, EA operations may bogusly fail with EACCES.
+ */
+ if (zp->z_pflags & ZFS_XATTR)
+ return (0);
+#else
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+#endif
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. Map the bits mapped by working_mode (currently
+ * missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+ vnode_t *check_vp = ZTOV(check_zp);
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= VEXEC;
+
+ error = secpolicy_vnode_access2(cr, check_vp, owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(check_vp, cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(check_vp, cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+ mode_t available_perms, cred_t *cr)
+{
+ int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether Access should be granted/deny, without
+ * consulting least priv subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ mode_t available_perms;
+ boolean_t dzpcheck_privs = B_TRUE;
+ boolean_t zpcheck_privs = B_TRUE;
+
+ /*
+ * We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ *
+ * We will ask for all of the necessary permissions and then
+ * look at the working modes from the directory and target object
+ * to determine what was found.
+ */
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * First row
+ * If the directory permissions allow the delete, we are done.
+ */
+ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ /*
+ * If target object has delete permission then we are done
+ */
+ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ ASSERT(dzp_error && zp_error);
+
+ if (!dzpcheck_privs)
+ return (dzp_error);
+ if (!zpcheck_privs)
+ return (zp_error);
+
+ /*
+ * Second row
+ *
+ * If directory returns EACCES then delete_child was denied
+ * due to deny delete_child. In this case send the request through
+ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
+ * since that *could* allow the delete based on write/execute permission
+ * and we want delete permissions to override write/execute.
+ */
+
+ if (dzp_error == EACCES) {
+ /* XXXPJD: s/dzp/zp/ ? */
+ return (secpolicy_vnode_remove(ZTOV(dzp), cr));
+ }
+ /*
+ * Third Row
+ * only need to see if we have write/execute on directory.
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+ if (dzp_error != 0 && !dzpcheck_privs)
+ return (dzp_error);
+
+ /*
+ * Fourth row
+ */
+
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+ * Rename permissions are combination of delete permission +
+ * add file/subdir permission.
+ *
+ * BSD operating systems also require write permission
+ * on the directory being moved from one parent directory
+ * to another.
+ */
+ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+ if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)))
+ return (error);
+ }
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+ * If we have a tzp, see if we can delete it?
+ */
+ if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c
new file mode 100644
index 000000000..ed6652c3b
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_ctldir.c
@@ -0,0 +1,1345 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
+
+#include <sys/dirent.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zap.h>
+#include <sys/sysproto.h>
+
+#include "zfs_namecheck.h"
+
+#include <sys/kernel.h>
+
+/* Common access mode for all virtual directories under the ctldir */
+const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+
+/*
+ * "Synthetic" filesystem implementation.
+ */
+
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+ char sn_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t sn_parent_id;
+ uint64_t sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artifical
+ * IDs, znode IDs) may clash.
+ */
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+ sfs_node_t *n1 = vp->v_data;
+ sfs_node_t *n2 = arg;
+ bool equal;
+
+ equal = n1->sn_id == n2->sn_id &&
+ n1->sn_parent_id == n2->sn_parent_id;
+
+ /* Zero means equality. */
+ return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
+
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
+
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof (node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof (*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof (node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent. */
+ if (ap->a_ncookies != NULL)
+ *ap->a_ncookies = 0;
+
+ if (uio->uio_resid < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+
+ if (uio->uio_offset < 0)
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == 0) {
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '\0';
+ entry.d_namlen = 1;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (uio->uio_offset < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == sizeof (entry)) {
+ entry.d_fileno = parent_id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '.';
+ entry.d_name[2] = '\0';
+ entry.d_namlen = 2;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (offp != NULL)
+ *offp = 2 * sizeof (entry);
+ return (0);
+}
+
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+#define ZFSCTL_INO_SNAP(id) (id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+ return (vn_matchops(vp, zfsctl_ops_root) ||
+ vn_matchops(vp, zfsctl_ops_snapdir) ||
+ vn_matchops(vp, zfsctl_ops_snapshot) ||
+ vn_matchops(vp, zfsctl_ops_shares_dir));
+
+}
+
+typedef struct zfsctl_root {
+ sfs_node_t node;
+ sfs_node_t *snapdir;
+ timestruc_t cmtime;
+} zfsctl_root_t;
+
+
+/*
+ * Create the '.zfs' directory.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ zfsctl_root_t *dot_zfs;
+ sfs_node_t *snapdir;
+ vnode_t *rvp;
+ uint64_t crtime[2];
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+ ZFSCTL_INO_SNAPDIR);
+ dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0,
+ ZFSCTL_INO_ROOT);
+ dot_zfs->snapdir = snapdir;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+ vput(rvp);
+
+ zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+ sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* We support shared locking. */
+ VN_LOCK_ASHARE(vp);
+ vp->v_type = VDIR;
+ vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir;
+ err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+ zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir;
+ err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+ &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+ int error;
+
+ error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+ return (error);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+static int
+zfsctl_common_access(struct vop_access_args *ap)
+{
+ accmode_t accmode = ap->a_accmode;
+
+ if (accmode & VWRITE)
+ return (SET_ERROR(EACCES));
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ timestruc_t now;
+ sfs_node_t *node;
+
+ node = vp->v_data;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+ * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vn_fsid(vp, vap);
+ vap->va_mode = zfsctl_ctldir_mode;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+
+ vap->va_nodeid = node->sn_id;
+
+ /* At least '.' and '..'. */
+ vap->va_nlink = 2;
+}
+
+#ifndef _OPENSOLARIS_SYS_VNODE_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfsctl_common_fid(struct vop_fid_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ sfs_node_t *node = vp->v_data;
+ uint64_t object = node->sn_id;
+ zfid_short_t *zfid;
+ int i;
+
+ zfid = (zfid_short_t *)fidp;
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs nodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfsctl_common_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ (void) sfs_reclaim_vnode(vp);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_print_args {
+ struct vnode *a_vp;
+};
+#endif
+
+static int
+zfsctl_common_print(struct vop_print_args *ap)
+{
+ sfs_print_node(ap->a_vp->v_data);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get root directory attributes.
+ */
+static int
+zfsctl_root_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsctl_root_t *node = vp->v_data;
+
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = node->cmtime;
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ vap->va_nlink += 1; /* snapdir */
+ vap->va_size = vap->va_nlink;
+ return (0);
+}
+
+/*
+ * When we lookup "." we still can be asked to lock it
+ * differently, can't we?
+ */
+int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+ vref(dvp);
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /* Relock for the "." case may left us with reclaimed vnode. */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(struct vop_lookup_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ int flags = ap->a_cnp->cn_flags;
+ int lkflags = ap->a_cnp->cn_lkflags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ } else if ((flags & ISDOTDOT) != 0) {
+ err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+ lkflags, vpp);
+ } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+ err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_root_readdir(struct vop_readdir_args *ap)
+{
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_root_t *node = vp->v_data;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+ if (uio->uio_offset != dots_offset)
+ return (SET_ERROR(EINVAL));
+
+ CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name));
+ entry.d_fileno = node->snapdir->sn_id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, node->snapdir->sn_name);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ return (SET_ERROR(error));
+ }
+ if (eofp != NULL)
+ *eofp = 1;
+ return (0);
+}
+
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
+{
+ static const char dotzfs_name[4] = ".zfs";
+ vnode_t *dvp;
+ int error;
+
+ if (*ap->a_buflen < sizeof (dotzfs_name))
+ return (SET_ERROR(ENOMEM));
+
+ error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+ LK_SHARED, &dvp);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= sizeof (dotzfs_name);
+ bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+ return (0);
+}
+
+static int
+zfsctl_common_pathconf(struct vop_pathconf_args *ap)
+{
+ /*
+ * We care about ACL variables so that user land utilities like ls
+ * can display them correctly. Since the ctldir's st_dev is set to be
+ * the same as the parent dataset, we must support all variables that
+ * it supports.
+ */
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 64;
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *ap->a_retval = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ case _PC_ACL_EXTENDED:
+ *ap->a_retval = 0;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *ap->a_retval = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *ap->a_retval = ACL_MAX_ENTRIES;
+ return (0);
+
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * Returns a trivial ACL
+ */
+int
+zfsctl_common_getacl(struct vop_getacl_args *ap)
+{
+ int i;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
+ /*
+ * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+ * attributes. That is not the case for the ctldir, so we must clear
+ * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+ * aren't supported by the ctldir.
+ */
+ for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+ struct acl_entry *entry;
+ entry = &(ap->a_aclp->acl_entry[i]);
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+ ACL_READ_NAMED_ATTRS);
+ }
+
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_root_readdir,
+ .vop_lookup = zfsctl_root_lookup,
+ .vop_inactive = VOP_NULL,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_vptocnp = zfsctl_root_vptocnp,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+ int err;
+
+ err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
+ return (err);
+}
+
+/*
+ * Given a vnode get a root vnode of a filesystem mounted on top of
+ * the vnode, if any. The root vnode is referenced and locked.
+ * If no filesystem is mounted then the orinal vnode remains referenced
+ * and locked. If any error happens the orinal vnode is unlocked and
+ * released.
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+ struct mount *mp;
+ int err;
+
+ ASSERT_VOP_LOCKED(*vpp, __func__);
+ ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+ if ((mp = (*vpp)->v_mountedhere) != NULL) {
+ err = vfs_busy(mp, 0);
+ KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+ KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+ vput(*vpp);
+ err = VFS_ROOT(mp, flags, vpp);
+ vfs_unbusy(mp);
+ return (err);
+ }
+ return (EJUSTRETURN);
+}
+
+typedef struct {
+ const char *snap_name;
+ uint64_t snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+ snapshot_setup_arg_t *ssa = arg;
+ sfs_node_t *node;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ node = sfs_alloc_node(sizeof (sfs_node_t),
+ ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+ zfsctl_common_vnode_setup(vp, node);
+
+ /* We have to support recursive locking. */
+ VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */
+int
+zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ char name[NAME_MAX + 1];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
+ char *mountpoint;
+ size_t mountpoint_len;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ uint64_t snap_id;
+ int nameiop = cnp->cn_nameiop;
+ int lkflags = cnp->cn_lkflags;
+ int flags = cnp->cn_flags;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ return (err);
+ }
+ if (flags & ISDOTDOT) {
+ err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+ vpp);
+ return (err);
+ }
+
+ if (cnp->cn_namelen >= sizeof (name))
+ return (SET_ERROR(ENAMETOOLONG));
+
+ strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+ err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+ if (err != 0)
+ return (SET_ERROR(ENOENT));
+
+ for (;;) {
+ snapshot_setup_arg_t ssa;
+
+ ssa.snap_name = name;
+ ssa.snap_id = snap_id;
+ err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+ snap_id, "zfs", &zfsctl_ops_snapshot,
+ zfsctl_snapshot_vnode_setup, &ssa, vpp);
+ if (err != 0)
+ return (err);
+
+ /* Check if a new vnode has just been created. */
+ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+ break;
+
+ /*
+ * Check if a snapshot is already mounted on top of the vnode.
+ */
+ err = zfsctl_mounted_here(vpp, lkflags);
+ if (err != EJUSTRETURN)
+ return (err);
+
+ /*
+ * If the vnode is not covered, then either the mount operation
+ * is in progress or the snapshot has already been unmounted
+ * but the vnode hasn't been inactivated and reclaimed yet.
+ * We can try to re-use the vnode in the latter case.
+ */
+ VI_LOCK(*vpp);
+ if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+ /*
+ * Upgrade to exclusive lock in order to:
+ * - avoid race conditions
+ * - satisfy the contract of mount_snapshot()
+ */
+ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+ if (err == 0)
+ break;
+ } else {
+ VI_UNLOCK(*vpp);
+ }
+
+ /*
+ * In this state we can loop on uncontested locks and starve
+ * the thread doing the lengthy, non-trivial mount operation.
+ * So, yield to prevent that from happening.
+ */
+ vput(*vpp);
+ kern_yield(PRI_USER);
+ }
+
+ VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname));
+
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len,
+ "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+ err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ if (err == 0) {
+ /*
+ * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+ *
+ * This is where we lie about our v_vfsp in order to
+ * make .zfs/snapshot/<snapname> accessible over NFS
+ * without requiring manual mounts of <snapname>.
+ */
+ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+ VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+ /* Clear the root flag (set via VFS_ROOT) as well. */
+ (*vpp)->v_vflag &= ~VV_ROOT;
+ }
+
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+
+ ZFS_ENTER(zfsvfs);
+ for (;;) {
+ uint64_t cookie;
+ uint64_t id;
+
+ cookie = uio->uio_offset - dots_offset;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, snapname);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(error));
+ }
+ uio->uio_offset = cookie + dots_offset;
+ }
+ /* NOTREACHED */
+}
+
+static int
+zfsctl_snapdir_getattr(struct vop_getattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
+ uint64_t snap_count;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ vap->va_nlink += snap_count;
+ }
+ vap->va_size = vap->va_nlink;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_snapdir_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
+
+
+static int
+zfsctl_snapshot_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ VERIFY(vrecycle(vp) == 1);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ void *data = vp->v_data;
+
+ sfs_reclaim_vnode(vp);
+ sfs_destroy_node(data);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
+{
+ struct mount *mp;
+ vnode_t *dvp;
+ vnode_t *vp;
+ sfs_node_t *node;
+ size_t len;
+ int locked;
+ int error;
+
+ vp = ap->a_vp;
+ node = vp->v_data;
+ len = strlen(node->sn_name);
+ if (*ap->a_buflen < len)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Prevent unmounting of the snapshot while the vnode lock
+ * is not held. That is not strictly required, but allows
+ * us to assert that an uncovered snapshot vnode is never
+ * "leaked".
+ */
+ mp = vp->v_mountedhere;
+ if (mp == NULL)
+ return (SET_ERROR(ENOENT));
+ error = vfs_busy(mp, 0);
+ KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
+
+ /*
+ * We can vput the vnode as we can now depend on the reference owned
+ * by the busied mp. But we also need to hold the vnode, because
+ * the reference may go after vfs_unbusy() which has to be called
+ * before we can lock the vnode again.
+ */
+ locked = VOP_ISLOCKED(vp);
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(vp);
+#else
+ vhold(vp);
+#endif
+ vput(vp);
+
+ /* Look up .zfs/snapshot, our parent. */
+ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+ if (error == 0) {
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= len;
+ bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+ }
+ vfs_unbusy(mp);
+#if __FreeBSD_version >= 1300045
+ vget_finish(vp, locked | LK_RETRY, vs);
+#else
+ vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+#endif
+ return (error);
+}
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+ .vop_default = NULL, /* ensure very restricted access */
+ .vop_inactive = zfsctl_snapshot_inactive,
+#if __FreeBSD_version >= 1300045
+ .vop_need_inactive = vop_stdneed_inactive,
+#endif
+ .vop_reclaim = zfsctl_snapshot_reclaim,
+ .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_lock1 = vop_stdlock,
+ .vop_unlock = vop_stdunlock,
+ .vop_islocked = vop_stdislocked,
+ .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
+ .vop_print = zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ zfsvfs_t *zfsvfs __unused = vfsp->vfs_data;
+ vnode_t *vp;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ *zfsvfsp = NULL;
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+ if (error == 0 && vp != NULL) {
+ /*
+ * XXX Probably need to at least reference, if not busy, the mp.
+ */
+ if (vp->v_mountedhere != NULL)
+ *zfsvfsp = vp->v_mountedhere->mnt_data;
+ vput(vp);
+ }
+ if (*zfsvfsp == NULL)
+ return (SET_ERROR(EINVAL));
+ return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct mount *mp;
+ vnode_t *vp;
+ uint64_t cookie;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+
+ cookie = 0;
+ for (;;) {
+ uint64_t id;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ for (;;) {
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, id, &vp);
+ if (error != 0 || vp == NULL)
+ break;
+
+ mp = vp->v_mountedhere;
+
+ /*
+ * v_mountedhere being NULL means that the
+ * (uncovered) vnode is in a transient state
+ * (mounting or unmounting), so loop until it
+ * settles down.
+ */
+ if (mp != NULL)
+ break;
+ vput(vp);
+ }
+ if (error != 0)
+ break;
+ if (vp == NULL)
+ continue; /* no mountpoint, nothing to do */
+
+ /*
+ * The mount-point vnode is kept locked to avoid spurious EBUSY
+ * from a concurrent umount.
+ * The vnode lock must have recursive locking enabled.
+ */
+ vfs_ref(mp);
+ error = dounmount(mp, fflags, curthread);
+ KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+ ("extra references after unmount"));
+ vput(vp);
+ if (error != 0)
+ break;
+ }
+ KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+ ("force unmounting failed"));
+ return (error);
+}
+
+int
+zfsctl_snapshot_unmount(char *snapname, int flags __unused)
+{
+ vfs_t *vfsp = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+
+ if (strchr(snapname, '@') == NULL)
+ return (0);
+
+ int err = getzfsvfs(snapname, &zfsvfs);
+ if (err != 0) {
+ ASSERT3P(zfsvfs, ==, NULL);
+ return (0);
+ }
+ vfsp = zfsvfs->z_vfs;
+
+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+ vfs_ref(vfsp);
+ vfs_unbusy(vfsp);
+ return (dounmount(vfsp, MS_FORCE, curthread));
+}
diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c
new file mode 100644
index 000000000..2f5962b25
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_debug.c
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/kstat.h>
+
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+ (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+ return (0);
+}
+
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+ (void) snprintf(buf, size, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+
+ return (0);
+}
+
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ if (n == 0)
+ ksp->ks_private = list_head(&zfs_dbgmsgs);
+ else if (zdm)
+ ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+ return (ksp->ks_private);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ while (zfs_dbgmsg_size > max_size) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ if (zdm == NULL)
+ return;
+
+ size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ zfs_dbgmsg_purge(0);
+
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (zfs_dbgmsg_kstat) {
+ zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+ zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+ zfs_dbgmsg_kstat->ks_private = NULL;
+ zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+ kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+ zfs_dbgmsg_data, zfs_dbgmsg_addr);
+ kstat_install(zfs_dbgmsg_kstat);
+ }
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ if (zfs_dbgmsg_kstat)
+ kstat_delete(zfs_dbgmsg_kstat);
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ mutex_enter(&zfs_dbgmsgs_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs_lock);
+ mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, buf);
+
+ size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+ __dprintf(B_FALSE, file, func, line, "error %lu", err);
+}
+
+#ifdef _KERNEL
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline.
+ */
+ nl = strrchr(buf, '\n');
+ if (nl != NULL)
+ *nl = '\0';
+
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c
new file mode 100644
index 000000000..e93b3e2cf
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_dir.c
@@ -0,0 +1,961 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/extdirent.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, uint64_t *zoid)
+{
+ int error;
+
+ if (zfsvfs->z_norm) {
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, NULL, 0, NULL);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+ *zpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if (name[0] == '.' &&
+ (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect how we perform zap lookups.
+ *
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case insensitive
+ * filesystem when the lookup request is not exact because normalization
+ * can fold case independent of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
+ */
+
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, &zp);
+ if (error)
+ return (error);
+ ASSERT(!zp->z_unlinked);
+ *zpp = zp;
+ }
+
+ return (0);
+}
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (ENOENT);
+
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+ znode_t *zp = NULL;
+ int error = 0;
+
+#ifdef ZFS_DEBUG
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+#endif
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, &zp);
+ if (error == 0)
+ *zpp = zp;
+ } else {
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+ if (error == 0) {
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
+ }
+ }
+ return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(zp->z_links == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ dmu_tx_t *tx;
+ int error;
+
+ /*
+ * Interate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Due to changes in zfs_rmnode we need to make sure the
+ * link count is set to zero here.
+ */
+ if (zp->z_links != 0) {
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(zp));
+ continue;
+ }
+ zp->z_links = 0;
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &zp->z_links, sizeof (zp->z_links), tx));
+ dmu_tx_commit(tx);
+ }
+
+ zp->z_unlinked = B_TRUE;
+ vput(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents is *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ vput(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t count;
+ int error;
+
+ ASSERT(zp->z_links == 0);
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ } else {
+ /*
+ * Free up all the data in the file. We don't do this for
+ * XATTR directories because we need truncate and remove to be
+ * in the same tx, like in zfs_znode_delete(). Otherwise, if
+ * we crash here we'll end up with an inconsistent truncated
+ * zap object in the delete queue. Note a truncated file is
+ * harmless since it only contains user data.
+ */
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error)
+ xattr_obj = 0;
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xattr_obj)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
+ * FreeBSD's implemention of zfs_zget requires a vnode to back it.
+ * This means that we could end up calling into getnewvnode while
+ * calling zfs_rmnode as a result of a prior call to getnewvnode
+ * trying to clear vnodes out of the cache. If this repeats we can
+ * recurse enough that we overflow our stack. To avoid this, we
+ * avoid calling zfs_zget on the xattr znode and instead simply add
+ * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+ */
+ if (xattr_obj) {
+ /* Add extended attribute directory to the unlinked set. */
+ VERIFY3U(0, ==,
+ zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+ }
+
+ mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ /* Remove this znode from the unlinked set */
+ VERIFY3U(0, ==,
+ zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+ cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+ }
+
+ mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ if (xattr_obj) {
+ /*
+ * We're using the FreeBSD taskqueue API here instead of
+ * the Solaris taskq API since the FreeBSD API allows for a
+ * task to be enqueued multiple times but executed once.
+ */
+ taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+ }
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
+
+/*
+ * Link zp into dzp. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (zp_is_dir) {
+ if (dzp->z_links >= ZFS_LINK_MAX)
+ return (SET_ERROR(EMLINK));
+ }
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ return (SET_ERROR(ENOENT));
+ }
+ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+ return (SET_ERROR(EMLINK));
+ }
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ }
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ zp->z_links--;
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+ name, mt, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (!(flag & ZRENAMING)) {
+
+ if (zp_is_dir && !zfs_dirempty(zp))
+ return (SET_ERROR(ENOTEMPTY));
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0) {
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_links,
+ zp_is_dir + 1);
+ zp->z_links = zp_is_dir + 1;
+ }
+ if (--zp->z_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT0(error);
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t parent __unused;
+
+ *xvpp = NULL;
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ *xvpp = xzp;
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xzpp = xzp;
+ return (0);
+ }
+
+
+ if (!(flags & CREATE_XATTR_DIR))
+ return (SET_ERROR(ENOATTR));
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+ if (error == 0)
+ VOP_UNLOCK1(ZTOV(*xzpp));
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+
+ if (zdp->z_zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 000000000..ec7c04717
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+#include <sys/buf.h>
+#include <sys/stat.h>
+
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+ struct thread *td;
+ int rc, fd;
+
+ td = curthread;
+ pwd_ensure_dirs();
+ /* 12.x doesn't take a const char * */
+ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+ UIO_SYSSPACE, flags, mode);
+ if (rc)
+ return (SET_ERROR(rc));
+ fd = td->td_retval[0];
+ td->td_retval[0] = 0;
+ if (fget(curthread, fd, &cap_no_rights, fpp))
+ kern_close(td, fd);
+ return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+ fo_close(fp, curthread);
+}
+
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FWRITE) == 0)
+ return (SET_ERROR(EBADF));
+
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+
+ rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ if (resid)
+ *resid = auio.uio_resid;
+ else if (auio.uio_resid)
+ return (SET_ERROR(EIO));
+ *offp += count - auio.uio_resid;
+ return (rc);
+}
+
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FREAD) == 0)
+ return (SET_ERROR(EBADF));
+
+ rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ *resid = auio.uio_resid;
+ *offp += count - auio.uio_resid;
+ return (SET_ERROR(0));
+}
+
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+ return (rc);
+}
+
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+ int rc;
+ struct thread *td;
+
+ td = curthread;
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+ return (SET_ERROR(ESPIPE));
+ rc = fo_seek(fp, *offp, whence, td);
+ if (rc == 0)
+ *offp = td->td_uretoff.tdu_off;
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+ struct thread *td;
+ struct stat sb;
+ int rc;
+
+ td = curthread;
+
+ rc = fo_stat(fp, &sb, td->td_ucred, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ zfattr->zfa_size = sb.st_size;
+ zfattr->zfa_mode = sb.st_mode;
+
+ return (0);
+}
+
+static __inline int
+zfs_vop_fsync(vnode_t *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+ VOP_UNLOCK1(vp);
+ vn_finished_write(mp);
+drop:
+ return (SET_ERROR(error));
+}
+
+int
+zfs_file_fsync(zfs_file_t *fp, int flags)
+{
+ struct vnode *v;
+
+ if (fp->f_type != DTYPE_VNODE)
+ return (EINVAL);
+
+ v = fp->f_data;
+ return (zfs_vop_fsync(v));
+}
+
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+ struct file *fp;
+
+ if (fget(curthread, fd, &cap_no_rights, &fp))
+ return (SET_ERROR(EBADF));
+
+ *fpp = fp;
+ return (0);
+}
+
+void
+zfs_file_put(int fd)
+{
+ struct file *fp;
+
+ /* No CAP_ rights required, as we're only releasing. */
+ if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
+ fdrop(fp, curthread);
+ fdrop(fp, curthread);
+ }
+}
+
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+ return (fp->f_offset);
+}
+
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+ file_t *tmpfp;
+ void *data;
+ int error;
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (NULL);
+ return (data);
+}
+
+int
+zfs_file_unlink(const char *fnamep)
+{
+ enum uio_seg seg = UIO_SYSSPACE;
+ int rc;
+
+#if __FreeBSD_version >= 1300018
+ rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
+#else
+#ifdef AT_BENEATH
+ rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0);
+#else
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0);
+#endif
+#endif
+ return (SET_ERROR(rc));
+}
diff --git a/module/os/freebsd/zfs/zfs_fuid_os.c b/module/os/freebsd/zfs/zfs_fuid_os.c
new file mode 100644
index 000000000..ebd09abd6
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_fuid_os.c
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/avl.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/nvpair.h>
+#ifdef _KERNEL
+#include <sys/sid.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#endif
+#include <sys/zfs_fuid.h>
+
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+ cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+ uid_t id;
+
+ VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+ return ((uint64_t)id);
+}
diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000..4b7e86467
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_ioctl_os.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_redact.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/dmu_recv.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
+#include <sys/zfs_ioctl_impl.h>
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+ int error = 0;
+
+ if (*zfvp == NULL)
+ return (SET_ERROR(ESRCH));
+
+ error = vfs_busy((*zfvp)->z_vfs, 0);
+ if (error != 0) {
+ *zfvp = NULL;
+ error = SET_ERROR(ESRCH);
+ }
+ return (error);
+}
+
+int
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+ return (zfsvfs->z_vfs != NULL);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+ vfs_unbusy(zfsvfs->z_vfs);
+}
+
+static const zfs_ioc_key_t zfs_keys_nextboot[] = {
+ {"command", DATA_TYPE_STRING, 0},
+ { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0},
+ { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0}
+};
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char name[MAXNAMELEN];
+ spa_t *spa;
+ vdev_t *vd;
+ char *command;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ int error;
+
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_string(innvl,
+ "command", &command) != 0)
+ return (EINVAL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_by_guid(pool_guid, vdev_guid);
+ if (spa != NULL)
+ strcpy(name, spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+ if (spa == NULL)
+ return (ENOENT);
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENXIO);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+ error = vdev_label_write_pad2(vd, command, strlen(command));
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+
+void
+zfs_ioctl_init_os(void)
+{
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+ zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3);
+
+}
diff --git a/module/os/freebsd/zfs/zfs_onexit_os.c b/module/os/freebsd/zfs/zfs_onexit_os.c
new file mode 100644
index 000000000..8b22f2fdc
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_onexit_os.c
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_state(minor, ZST_ONEXIT);
+ if (*zo == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ file_t *fp, *tmpfp;
+ zfs_onexit_t *zo;
+ void *data;
+ int error;
+
+ if ((error = zfs_file_get(fd, &fp)))
+ return (error);
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ if (error == 0)
+ *minorp = (minor_t)(uintptr_t)data;
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (SET_ERROR(EBADF));
+ return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ zfs_file_put(fd);
+}
diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
new file mode 100644
index 000000000..d6f7fc11e
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -0,0 +1,2448 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <[email protected]>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/jail.h>
+#include <ufs/ufs/quota.h>
+#include <sys/zfs_quota.h>
+
+#include "zfs_comutil.h"
+
+#ifndef MNTK_VMSETSIZE_BUG
+#define MNTK_VMSETSIZE_BUG 0
+#endif
+#ifndef MNTK_NOMSYNC
+#define MNTK_NOMSYNC 8
+#endif
+
+/* BEGIN CSTYLED */
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+ "File system owner can perform privileged operation on his file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+ "Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+ "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+ "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+ "ZPL_VERSION");
+/* END CSTYLED */
+
+static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+#if __FreeBSD_version >= 1300049
+ .vfs_root = vfs_cache_root,
+ .vfs_cachedroot = zfs_root,
+#else
+ .vfs_root = zfs_root,
+#endif
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_checkexp = zfs_checkexp,
+ .vfs_fhtovp = zfs_fhtovp,
+ .vfs_quotactl = zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+ int error;
+ zfsvfs_t *zfvp;
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, &zfvp);
+ if (error != 0)
+ return (error);
+ if (zfvp == NULL)
+ return (ENOENT);
+ vfsp = zfvp->z_vfs;
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_XATTR:
+ if (zfvp->z_flags & ZSB_XATTR)
+ tmp = zfvp->z_xattr;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
+ tmp = 1;
+ break;
+ default:
+ vfs_unbusy(vfsp);
+ return (ENOENT);
+ }
+
+ vfs_unbusy(vfsp);
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+}
+
+static int
+zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
+{
+ int error = 0;
+ char buf[32];
+ uint64_t usedobj, quotaobj;
+ uint64_t quota, used = 0;
+ timespec_t now;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay) {
+ error = ENOENT;
+ goto done;
+ }
+ (void) sprintf(buf, "%llx", (longlong_t)id);
+ if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
+ buf, sizeof (quota), 1, &quota)) != 0) {
+ dprintf("%s(%d): quotaobj lookup failed\n",
+ __FUNCTION__, __LINE__);
+ goto done;
+ }
+ /*
+ * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
+ * So we set them to be the same.
+ */
+ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
+ error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
+ if (error && error != ENOENT) {
+ dprintf("%s(%d): usedobj failed; %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto done;
+ }
+ dqp->dqb_curblocks = btodb(used);
+ dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
+ vfs_timestamp(&now);
+ /*
+ * Setting this to 0 causes FreeBSD quota(8) to print
+ * the number of days since the epoch, which isn't
+ * particularly useful.
+ */
+ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+ return (error);
+}
+
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct thread *td;
+ int cmd, type, error = 0;
+ int bitsize;
+ zfs_userquota_prop_t quota_type;
+ struct dqblk64 dqblk = { 0 };
+
+ td = curthread;
+ cmd = cmds >> SUBCMDSHIFT;
+ type = cmds & SUBCMDMASK;
+
+ ZFS_ENTER(zfsvfs);
+ if (id == -1) {
+ switch (type) {
+ case USRQUOTA:
+ id = td->td_ucred->cr_ruid;
+ break;
+ case GRPQUOTA:
+ id = td->td_ucred->cr_rgid;
+ break;
+ default:
+ error = EINVAL;
+ if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+ vfs_unbusy(vfsp);
+ goto done;
+ }
+ }
+ /*
+ * Map BSD type to:
+ * ZFS_PROP_USERUSED,
+ * ZFS_PROP_USERQUOTA,
+ * ZFS_PROP_GROUPUSED,
+ * ZFS_PROP_GROUPQUOTA
+ */
+ switch (cmd) {
+ case Q_SETQUOTA:
+ case Q_SETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERQUOTA;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPQUOTA;
+ else
+ error = EINVAL;
+ break;
+ case Q_GETQUOTA:
+ case Q_GETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERUSED;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPUSED;
+ else
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * Depending on the cmd, we may need to get
+ * the ruid and domain (see fuidstr_to_sid?),
+ * the fuid (how?), or other information.
+ * Create fuid using zfs_fuid_create(zfsvfs, id,
+ * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+ * I think I can use just the id?
+ *
+ * Look at zfs_id_overquota() to look up a quota.
+ * zap_lookup(something, quotaobj, fuidstring,
+ * sizeof (long long), 1, &quota)
+ *
+ * See zfs_set_userquota() to set a quota.
+ */
+ if ((uint32_t)type >= MAXQUOTAS) {
+ error = EINVAL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case Q_GETQUOTASIZE:
+ bitsize = 64;
+ error = copyout(&bitsize, arg, sizeof (int));
+ break;
+ case Q_QUOTAON:
+ // As far as I can tell, you can't turn quotas on or off on zfs
+ error = 0;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_QUOTAOFF:
+ error = ENOTSUP;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_SETQUOTA:
+ error = copyin(&dqblk, arg, sizeof (dqblk));
+ if (error == 0)
+ error = zfs_set_userquota(zfsvfs, quota_type,
+ "", id, dbtob(dqblk.dqb_bhardlimit));
+ break;
+ case Q_GETQUOTA:
+ error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
+ if (error == 0)
+ error = copyout(&dqblk, arg, sizeof (dqblk));
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
+}
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (panicstr)
+ return (0);
+
+ /*
+ * Ignore the system syncher. ZFS already commits async data
+ * at zfs_txg_timeout intervals.
+ */
+ if (waitfor == MNT_LAZY)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dsl_pool_t *dp;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (rebooting && spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1M). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+/*
+ * The nbmand mount option can be changed at mount time.
+ * We can't allow it to be toggled on live file systems or incorrect
+ * behavior may be seen from cifs clients
+ *
+ * This property isn't registered via dsl_prop_register(), but this callback
+ * will be called when a file system is first mounted
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ if (newval == FALSE) {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+ } else {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ uint64_t nbmand;
+ boolean_t readonly = B_FALSE;
+ boolean_t do_readonly = B_FALSE;
+ boolean_t setuid = B_FALSE;
+ boolean_t do_setuid = B_FALSE;
+ boolean_t exec = B_FALSE;
+ boolean_t do_exec = B_FALSE;
+ boolean_t xattr = B_FALSE;
+ boolean_t atime = B_FALSE;
+ boolean_t do_atime = B_FALSE;
+ boolean_t do_xattr = B_FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * This function can be called for a snapshot when we update snapshot's
+ * mount point, which isn't really supported.
+ */
+ if (dmu_objset_is_snapshot(os))
+ return (EOPNOTSUPP);
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
+ do_xattr = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+ atime = B_FALSE;
+ do_atime = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+ atime = B_TRUE;
+ do_atime = B_TRUE;
+ }
+
+ /*
+ * We need to enter pool configuration here, so that we can use
+ * dsl_prop_get_int_ds() to handle the special nbmand property below.
+ * dsl_prop_get_integer() can not be used, because it has to acquire
+ * spa_namespace_lock and we can not do that because we already hold
+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
+ * with spa_namespace_lock held and the function calls ZFS vnode
+ * operations to write the cache file and thus z_teardown_lock is
+ * acquired after spa_namespace_lock.
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+ /*
+ * nbmand is a special property. It can only be changed at
+ * mount time.
+ *
+ * This is weird, but it is documented to only be changeable
+ * at mount time.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
+ nbmand = B_FALSE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
+ nbmand = B_TRUE;
+ } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ return (error);
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+ if (do_atime)
+ atime_changed_cb(zfsvfs, atime);
+
+ nbmand_changed_cb(zfsvfs, nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+ "on a version %lld pool\n. Pool must be upgraded to mount "
+ "this file system.", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+ return (0);
+}
+
+taskq_t *zfsvfs_taskq;
+
+static void
+zfsvfs_task_unlinked_drain(void *context, int pending __unused)
+{
+
+ zfs_unlinked_drain((zfsvfs_t *)context);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ /*
+ * XXX: Fix struct statfs so this isn't necessary!
+ *
+ * The 'osname' is used as the filesystem's special node, which means
+ * it must fit in statfs.f_mntfromname, or else it can't be
+ * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+ * 'zfs unmount' to think it's not mounted when it is.
+ */
+ if (strlen(osname) >= MNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
+ &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+
+ return (error);
+}
+
+
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
+ zfsvfs_task_unlinked_drain, zfsvfs);
+#ifdef DIAGNOSTIC
+ rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+ rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+ for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ *zfvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
+ return (SET_ERROR(EROFS));
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+ * If we are not mounting (ie: online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ boolean_t readonly;
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+ if (readonly != 0) {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ } else {
+ dsl_dir_t *dd;
+
+ zfs_unlinked_drain(zfsvfs);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dd->dd_activity_cancelled = B_FALSE;
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ boolean_t use_nc = zfsvfs->z_use_namecache;
+ zfsvfs->z_use_namecache = B_FALSE;
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ zfsvfs->z_use_namecache = use_nc;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i;
+
+ /*
+ * This is a barrier to prevent the filesystem from going away in
+ * zfs_znode_move() until we can safely ensure that the filesystem is
+ * not unmounted. We consider the filesystem valid before the barrier
+ * and invalid after the barrier.
+ */
+ rw_enter(&zfsvfs_lock, RW_READER);
+ rw_exit(&zfsvfs_lock);
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ ASSERT(zfsvfs->z_nr_znodes == 0);
+ list_destroy(&zfsvfs->z_all_znodes);
+ rrm_destroy(&zfsvfs->z_teardown_lock);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
+ }
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname)
+{
+ uint64_t recordsize, fsid_guid;
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
+ if (error)
+ return (error);
+ zfsvfs->z_vfs = vfsp;
+
+ if ((error = dsl_prop_get_integer(osname,
+ "recordsize", &recordsize, NULL)))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+ vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+ /*
+ * This can cause a loss of coherence between ARC and page cache
+ * on ZoF - unclear if the problem is in FreeBSD or ZoF
+ */
+ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
+ vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+ vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ vfsp->vfs_fsid.val[0] = fsid_guid;
+ vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ (vfsp->mnt_vfc->vfc_typenum & 0xFF);
+
+ /*
+ * Set features for file system.
+ */
+ zfs_set_fuid_feature(zfsvfs);
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+ } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ }
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ } else {
+ atomic_inc_32(&zfs_active_fs_count);
+ }
+
+ return (error);
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef SECLABEL
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+ uint64_t num = 0;
+
+ while (*str) {
+ if (*str < '0' || *str > '9')
+ return (SET_ERROR(EINVAL));
+
+ num = num*10 + *str++ - '0';
+ }
+
+ *objnum = num;
+ return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number'. Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
+ */
+static int
+zfs_parse_bootfs(char *bpath, char *outpath)
+{
+ char *slashp;
+ uint64_t objnum;
+ int error;
+
+ if (*bpath == 0 || *bpath == '/')
+ return (SET_ERROR(EINVAL));
+
+ (void) strcpy(outpath, bpath);
+
+ slashp = strchr(bpath, '/');
+
+ /* if no '/', just return the pool name */
+ if (slashp == NULL) {
+ return (0);
+ }
+
+ /* if not a number, just return the root dataset name */
+ if (str_to_uint64(slashp+1, &objnum)) {
+ return (0);
+ }
+
+ *slashp = '\0';
+ error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
+ *slashp = '/';
+
+ return (error);
+}
+
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (SET_ERROR(EACCES));
+ return (rdonly ? 0 : EACCES);
+ }
+ return (SET_ERROR(EACCES));
+}
+
+/*
+ * Determine whether the mount is allowed according to MAC check.
+ * by comparing (where appropriate) label of the dataset against
+ * the label of the zone being mounted into. If the dataset has
+ * no label, create one.
+ *
+ * Returns 0 if access allowed, error otherwise (e.g. EACCES)
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+ int error, retv;
+ zone_t *mntzone = NULL;
+ ts_label_t *mnt_tsl;
+ bslabel_t *mnt_sl;
+ bslabel_t ds_sl;
+ char ds_hexsl[MAXNAMELEN];
+
+ retv = EACCES; /* assume the worst */
+
+ /*
+ * Start by getting the dataset label if it exists.
+ */
+ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (SET_ERROR(EACCES));
+
+ /*
+ * If labeling is NOT enabled, then disallow the mount of datasets
+ * which have a non-default label already. No other label checks
+ * are needed.
+ */
+ if (!is_system_labeled()) {
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ return (SET_ERROR(EACCES));
+ }
+
+ /*
+ * Get the label of the mountpoint. If mounting into the global
+ * zone (i.e. mountpoint is not within an active zone and the
+ * zoned property is off), the label must be default or
+ * admin_low/admin_high only; no other checks are needed.
+ */
+ mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE);
+ if (mntzone->zone_id == GLOBAL_ZONEID) {
+ uint64_t zoned;
+
+ zone_rele(mntzone);
+
+ if (dsl_prop_get_integer(osname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (SET_ERROR(EACCES));
+ if (!zoned)
+ return (zfs_check_global_label(osname, ds_hexsl));
+ else
+ /*
+ * This is the case of a zone dataset being mounted
+ * initially, before the zone has been fully created;
+ * allow this mount into global zone.
+ */
+ return (0);
+ }
+
+ mnt_tsl = mntzone->zone_slabel;
+ ASSERT(mnt_tsl != NULL);
+ label_hold(mnt_tsl);
+ mnt_sl = label2bslabel(mnt_tsl);
+
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+ /*
+ * The dataset doesn't have a real label, so fabricate one.
+ */
+ char *str = NULL;
+
+ if (l_to_str_internal(mnt_sl, &str) == 0 &&
+ dsl_prop_set_string(osname,
+ zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ ZPROP_SRC_LOCAL, str) == 0)
+ retv = 0;
+ if (str != NULL)
+ kmem_free(str, strlen(str) + 1);
+ } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+ /*
+ * Now compare labels to complete the MAC check. If the
+ * labels are equal then allow access. If the mountpoint
+ * label dominates the dataset label, allow readonly access.
+ * Otherwise, access is denied.
+ */
+ if (blequal(mnt_sl, &ds_sl))
+ retv = 0;
+ else if (bldominates(mnt_sl, &ds_sl)) {
+ vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+ retv = 0;
+ }
+ }
+
+ label_rele(mnt_tsl);
+ zone_rele(mntzone);
+ return (retv);
+}
+#endif /* SECLABEL */
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+ char *p;
+
+ p = strchr(osname, '/');
+ if (p == NULL) {
+ if (strlen(osname) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(poolname, osname);
+ } else {
+ if (p - osname >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(poolname, osname, p - osname);
+ poolname[p - osname] = '\0';
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp)
+{
+ kthread_t *td = curthread;
+ vnode_t *mvp = vfsp->mnt_vnodecovered;
+ cred_t *cr = td->td_ucred;
+ char *osname;
+ int error = 0;
+ int canwrite;
+
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If full-owner-access is enabled and delegated administration is
+ * turned on, we must set nosuid.
+ */
+ if (zfs_super_owner &&
+ dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Check for mount privilege?
+ *
+ * If we don't have privilege then see if
+ * we have local permission to allow it
+ */
+ error = secpolicy_fs_mount(cr, mvp, vfsp);
+ if (error) {
+ if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+ goto out;
+
+ if (!(vfsp->vfs_flag & MS_REMOUNT)) {
+ vattr_t vattr;
+
+ /*
+ * Make sure user is the owner of the mount point
+ * or has sufficient privileges.
+ */
+
+ vattr.va_mask = AT_UID;
+
+ vn_lock(mvp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(mvp, &vattr, cr)) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+
+ if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+ VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+ VOP_UNLOCK1(mvp);
+ }
+
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+#ifdef SECLABEL
+ error = zfs_mount_label_policy(vfsp, osname);
+ if (error)
+ goto out;
+#endif
+
+ vfsp->vfs_flag |= MNT_NFS4ACLS;
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT) {
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Refresh mount options with z_teardown_lock blocking I/O while
+ * the filesystem is in an inconsistent state.
+ * The lock also serializes this code with filesystem
+ * manipulations between entry to zfs_suspend_fs() and return
+ * from zfs_resume_fs().
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ zfs_unregister_callbacks(zfsvfs);
+ error = zfs_register_callbacks(vfsp);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ goto out;
+ }
+
+ /* Initial root mount: try hard to import the requested root pool. */
+ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+ (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+ char pname[MAXNAMELEN];
+
+ error = getpoolname(osname, pname);
+ if (error == 0)
+ error = spa_import_rootpool(pname);
+ if (error)
+ goto out;
+ }
+ DROP_GIANT();
+ error = zfs_domount(vfsp, osname);
+ PICKUP_GIANT();
+
+out:
+ return (error);
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = SPA_MINBLOCKSIZE;
+ statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+ * file system, but reported in terms of f_frsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of object available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
+
+ /*
+ * We're a zfs filesystem.
+ */
+ strlcpy(statp->f_fstypename, "zfs",
+ sizeof (statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof (statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof (statp->f_mntonname));
+
+ statp->f_namemax = MAXNAMELEN - 1;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+
+ if (error == 0) {
+ error = vn_lock(*vpp, flags);
+ if (error != 0) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+ dsl_dir_t *dd;
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the zrele_taskq to ensure all active references to the
+ * zfsvfs_t have been handled only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but zreles run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent's filesystem's vfsp. Note,
+ * 'z_parent' is self referential for non-snapshots.
+ */
+#ifdef FREEBSD_NAMECACHE
+ cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+ * If we are not unmounting (ie: online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+ * relavent for forced unmount).
+ *
+ * Release all holds on dbufs.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp))
+ if (zp->z_sa_hdl) {
+ ASSERT(ZTOV(zp)->v_count >= 0);
+ zfs_znode_dmu_fini(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data
+ */
+ if (!zfs_is_readonly(zfsvfs))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dsl_dir_cancel_waiters(dd);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ objset_t *os;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ ret = secpolicy_fs_unmount(cr, vfsp);
+ if (ret) {
+ if (dsl_deleg_access((char *)vfsp->vfs_resource,
+ ZFS_DELEG_PERM_MOUNT, cr))
+ return (ret);
+ }
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ /*
+ * Mark file system as unmounted before calling
+ * vflush(FORCECLOSE). This way we ensure no future vnops
+ * will be called and risk operating on DOOMED vnodes.
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ zfsvfs->z_unmounted = B_TRUE;
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0)
+ return (ret);
+ while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task, NULL) != 0)
+ taskqueue_drain(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ /*
+ * We can now safely destroy the '.zfs' directory node.
+ */
+ if (zfsvfs->z_ctldir != NULL)
+ zfsctl_destroy(zfsvfs);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ /*
+ * zfs_zget() can't operate on virtual entries like .zfs/ or
+ * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
+ * This will make NFS to switch to LOOKUP instead of using VGET.
+ */
+ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+ (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+ return (EOPNOTSUPP);
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
+ if (err == 0 && zp->z_unlinked) {
+ vrele(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err == 0)
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ if (err == 0) {
+ err = vn_lock(*vpp, flags);
+ if (err != 0)
+ vrele(*vpp);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * If this is regular file system vfsp is the same as
+ * zfsvfs->z_parent->z_vfs, but if it is snapshot,
+ * zfsvfs->z_parent->z_vfs represents parent file system
+ * which we have to use here, because only this file system
+ * has mnt_export configured.
+ */
+ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+ credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+ struct componentname cn;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ vnode_t *dvp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * On FreeBSD we can get snapshot's mount point or its parent file
+ * system mount point depending if snapshot is already mounted or not.
+ */
+ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (SET_ERROR(EINVAL));
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+ * directory tree. If the object == zfsvfs->z_shares_dir, then
+ * we are in the .zfs/shares directory tree.
+ */
+ if ((fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+ (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+ ZFS_EXIT(zfsvfs);
+ VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN | LOCKLEAF;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else if (object == zfsvfs->z_shares_dir) {
+ /*
+ * XXX This branch must not be taken,
+ * if it is, then the lookup below will
+ * explode.
+ */
+ cn.cn_nameptr = "shares";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else {
+ *vpp = dvp;
+ }
+ return (err);
+ }
+
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+ vrele(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ err = vn_lock(*vpp, flags);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
+}
+
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err;
+ znode_t *zp;
+
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ ds->ds_dir->dd_activity_cancelled = B_FALSE;
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs. If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+bail:
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+ vfs_ref(zfsvfs->z_vfs);
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+ }
+ }
+ return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ zfsvfs_free(zfsvfs);
+
+ atomic_dec_32(&zfs_active_fs_count);
+}
+
+#ifdef __i386__
+static int desiredvnodes_backup;
+#endif
+
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+ int newdesiredvnodes;
+
+ desiredvnodes_backup = desiredvnodes;
+
+ /*
+ * We calculate newdesiredvnodes the same way it is done in
+ * vntblinit(). If it is equal to desiredvnodes, it means that
+ * it wasn't tuned by the administrator and we can tune it down.
+ */
+ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+ vm_kmem_size / (5 * (sizeof (struct vm_object) +
+ sizeof (struct vnode))));
+ if (newdesiredvnodes == desiredvnodes)
+ desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
+}
+
+static void
+zfs_vnodes_adjust_back(void)
+{
+
+#ifdef __i386__
+ desiredvnodes = desiredvnodes_backup;
+#endif
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+
+ /*
+ * Reduce number of vnodes. Originally number of vnodes is calculated
+ * with UFS inode in mind. We reduce it here, because it's too big for
+ * ZFS/i386.
+ */
+ zfs_vnodes_adjust();
+
+ dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
+
+ zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
+}
+
+void
+zfs_fini(void)
+{
+ taskq_destroy(zfsvfs_taskq);
+ zfsctl_fini();
+ zfs_znode_fini();
+ zfs_vnodes_adjust_back();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_vfs, 0);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %lu to %lu", zfsvfs->z_version, newvers);
+
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
+ pname = ZPL_VERSION_STR;
+ } else {
+ pname = zfs_prop_to_name(prop);
+ }
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the coresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_vfs != NULL &&
+ (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+#ifdef _KERNEL
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ char tmpbuf[MAXPATHLEN];
+ struct mount *mp;
+ char *fromname;
+ size_t oldlen;
+
+ oldlen = strlen(oldname);
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ fromname = mp->mnt_stat.f_mntfromname;
+ if (strcmp(fromname, oldname) == 0) {
+ (void) strlcpy(fromname, newname,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ if (strncmp(fromname, oldname, oldlen) == 0 &&
+ (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+ (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
+ newname, fromname + oldlen);
+ (void) strlcpy(fromname, tmpbuf,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+#endif
diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c
new file mode 100644
index 000000000..d7b92035f
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_vnops.c
@@ -0,0 +1,6533 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vm.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/kdb.h>
+#include <sys/sysproto.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_rlock.h>
+#include <sys/extdirent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sched.h>
+#include <sys/acl.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_param.h>
+#include <sys/zil.h>
+#include <sys/zfs_vnops.h>
+
+#include <vm/vm_object.h>
+
+#include <sys/extattr.h>
+#include <sys/priv.h>
+
+#ifndef VN_OPEN_INVFS
+#define VN_OPEN_INVFS 0x0
+#endif
+
+#if __FreeBSD_version >= 1300047
+#define vm_page_wire_lock(pp)
+#define vm_page_wire_unlock(pp)
+#else
+#define vm_page_wire_lock(pp) vm_page_lock(pp)
+#define vm_page_wire_unlock(pp) vm_page_unlock(pp)
+#endif
+
+static int
+zfs_u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+
+ return (u8_validate(__DECONST(char *, u8str), n, list, flag, errnum));
+}
+#define u8_validate zfs_u8_validate
+
+#ifdef DEBUG_VFS_LOCKS
+#define VNCHECKREF(vp) \
+ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \
+ ("%s: wrong ref counts", __func__));
+#else
+#define VNCHECKREF(vp)
+#endif
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done avoiding races using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(*vpp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & FAPPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (fs_vscan(*vpp, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off)
+{
+ znode_t *zp = VTOZ(vp);
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == _FIO_SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /* file was dirty, so fall back to using generic logic */
+ if (error == EBUSY) {
+ if (hole)
+ *off = file_sz;
+
+ return (0);
+ }
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
+ int *rvalp)
+{
+ offset_t off;
+ int error;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp;
+
+ switch (com) {
+ case _FIOFFS:
+ {
+ return (0);
+
+ /*
+ * The following two ioctls are used by bfu. Faking out,
+ * necessary to avoid bfu errors.
+ */
+ }
+ case _FIOGDIO:
+ case _FIOSDIO:
+ {
+ return (0);
+ }
+
+ case _FIO_SEEK_DATA:
+ case _FIO_SEEK_HOLE:
+ {
+ off = *(offset_t *)data;
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* offset parameter is in/out */
+ error = zfs_holey(vp, com, &off);
+ ZFS_EXIT(zfsvfs);
+ if (error)
+ return (error);
+ *(offset_t *)data = off;
+ return (0);
+ }
+ }
+ return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t end;
+
+ /*
+ * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+ * aligned boundaries, if the range is not aligned. As a result a
+ * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+ * It may happen that all DEV_BSIZE subranges are marked clean and thus
+ * the whole page would be considred clean despite have some dirty data.
+ * For this reason we should shrink the range to DEV_BSIZE aligned
+ * boundaries before calling vm_page_clear_dirty.
+ */
+ end = rounddown2(off + nbytes, DEV_BSIZE);
+ off = roundup2(off, DEV_BSIZE);
+ nbytes = end - off;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked(obj);
+#if __FreeBSD_version < 1300050
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+ vm_page_sbusy(pp);
+ } else if (pp != NULL) {
+ ASSERT(!pp->valid);
+ pp = NULL;
+ }
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+ break;
+ }
+#else
+ vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
+ VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+#endif
+ return (pp);
+}
+
+static void
+page_unbusy(vm_page_t pp)
+{
+
+ vm_page_sunbusy(pp);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(pp->object);
+#else
+ vm_object_pip_subtract(pp->object, 1);
+#endif
+}
+
+#if __FreeBSD_version > 1300051
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t m;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked(obj);
+
+ vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
+ VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY);
+ return (m);
+}
+#else
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked(obj);
+
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_wire_lock(pp);
+ vm_page_hold(pp);
+ vm_page_wire_unlock(pp);
+
+ } else
+ pp = NULL;
+ break;
+ }
+ return (pp);
+}
+#endif
+
+static void
+page_unhold(vm_page_t pp)
+{
+
+ vm_page_wire_lock(pp);
+#if __FreeBSD_version >= 1300035
+ vm_page_unwire(pp, PQ_ACTIVE);
+#else
+ vm_page_unhold(pp);
+#endif
+ vm_page_wire_unlock(pp);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+ int segflg, dmu_tx_t *tx)
+{
+ vm_object_t obj;
+ struct sf_buf *sf;
+ caddr_t va;
+ int off;
+
+ ASSERT(segflg != UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock(obj);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_add(obj, 1);
+#endif
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ int nbytes = imin(PAGESIZE - off, len);
+
+ if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+ zfs_vmobject_wunlock(obj);
+
+ va = zfs_map_page(pp, &sf);
+ (void) dmu_read(os, oid, start+off, nbytes,
+ va+off, DMU_READ_PREFETCH);
+ zfs_unmap_page(sf);
+
+ zfs_vmobject_wlock(obj);
+ page_unbusy(pp);
+ }
+ len -= nbytes;
+ off = 0;
+ }
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(obj);
+#else
+ vm_object_pip_wakeupn(obj, 0);
+#endif
+ zfs_vmobject_wunlock(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(uio->uio_segflg == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+ zfs_vmobject_wlock(obj);
+ for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+ if (vm_page_none_valid(pp)) {
+ zfs_vmobject_wunlock(obj);
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock(obj);
+ vm_page_do_sunbusy(pp);
+#if __FreeBSD_version >= 1300047 && __FreeBSD_version < 1300051
+#error "unsupported version window"
+#elif __FreeBSD_version >= 1300051
+ if (error == 0) {
+ vm_page_valid(pp);
+ vm_page_lock(pp);
+ vm_page_activate(pp);
+ vm_page_unlock(pp);
+ }
+ vm_page_do_sunbusy(pp);
+ if (error != 0 && !vm_page_wired(pp) == 0 &&
+ pp->valid == 0 && vm_page_tryxbusy(pp))
+ vm_page_free(pp);
+#else
+ vm_page_lock(pp);
+ if (error) {
+ if (pp->wire_count == 0 && pp->valid == 0 &&
+ !vm_page_busied(pp))
+ vm_page_free(pp);
+ } else {
+ pp->valid = VM_PAGE_BITS_ALL;
+ vm_page_activate(pp);
+ }
+ vm_page_unlock(pp);
+#endif
+ } else {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(pp);
+ }
+ if (error)
+ break;
+ uio->uio_resid -= bytes;
+ uio->uio_offset += bytes;
+ len -= bytes;
+ }
+ zfs_vmobject_wunlock(obj);
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ * else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ vm_object_t obj;
+ int64_t start;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+ if ((pp = page_hold(vp, start))) {
+ struct sf_buf *sf;
+ caddr_t va;
+
+ zfs_vmobject_wunlock(obj);
+ va = zfs_map_page(pp, &sf);
+ error = vn_io_fault_uiomove(va + off, bytes, uio);
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock(obj);
+ page_unhold(pp);
+ } else {
+ zfs_vmobject_wunlock(obj);
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ zfs_vmobject_wlock(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ zfs_vmobject_wunlock(obj);
+ return (error);
+}
+
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: vp - vnode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - SYNC flags; used to provide FRSYNC semantics.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ssize_t n, nbytes;
+ int error = 0;
+ zfs_locked_range_t *lr;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ */
+ if (zfsvfs->z_log &&
+ (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset,
+ uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_size);
+ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+ while (n > 0) {
+ nbytes = MIN(n, zfs_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+ if (uio->uio_segflg == UIO_NOCOPY)
+ error = mappedread_sf(vp, nbytes, uio);
+ else if (vn_has_cached_data(vp)) {
+ error = mappedread(vp, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+out:
+ zfs_rangelock_exit(lr);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: vp - vnode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
+ * set if in append mode.
+ * cr - credentials of caller.
+ * ct - caller context (NFS/CIFS fem monitor only)
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ rlim64_t limit = MAXOFFSET_T;
+ ssize_t start_resid = uio->uio_resid;
+ ssize_t tx_bytes;
+ uint64_t end_size;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog;
+ offset_t woff;
+ ssize_t n, nbytes;
+ zfs_locked_range_t *lr;
+ int max_blksz = zfsvfs->z_max_blksz;
+ int error = 0;
+ arc_buf_t *abuf;
+ iovec_t *aiov = NULL;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt __unused = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
+ int count = 0;
+ sa_bulk_attr_t bulk[4];
+ uint64_t mtime[2], ctime[2];
+ uint64_t uid, gid, projid;
+
+ /*
+ * Fasttrack empty write
+ */
+ n = start_resid;
+ if (n == 0)
+ return (0);
+
+ if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+ limit = MAXOFFSET_T;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If immutable or not appending then return EPERM.
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common()
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Validate file offset
+ */
+ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ if (ioflag & FAPPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ uio->uio_loffset = woff;
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (EFBIG);
+ }
+
+ if (woff >= limit) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if ((woff + n) > limit || woff > (limit - n))
+ n = limit - woff;
+
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_size);
+
+ end_size = MAX(zp->z_size, woff + n);
+
+ uid = zp->z_uid;
+ gid = zp->z_gid;
+ projid = zp->z_projid;
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ woff = uio->uio_loffset;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ abuf = NULL;
+ if (xuio) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (n >= max_blksz &&
+ woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT(cbytes == max_blksz);
+ }
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If zfs_range_lock() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since zfs_range_reduce() will
+ * shrink down r_len to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ if (woff + nbytes > zp->z_size)
+ vnode_pager_setsize(vp, woff + nbytes);
+
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ tx_bytes -= uio->uio_resid;
+ } else {
+ tx_bytes = nbytes;
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff,
+ abuf, tx);
+ }
+ ASSERT(tx_bytes <= uio->uio_resid);
+ uioskip(uio, tx_bytes);
+ }
+ if (tx_bytes && vn_has_cached_data(vp)) {
+ update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+ zp->z_id, uio->uio_segflg, tx);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the excute bits is set.
+ *
+ * It would be nice to to this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(vp, cr,
+ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ uio->uio_loffset);
+ ASSERT(error == 0 || error == EFAULT);
+ }
+ /*
+ * If we are replaying and eof is non zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ if (error == 0)
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ else
+ (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+ ioflag, NULL, NULL);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+
+ }
+
+ zfs_rangelock_exit(lr);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * EFAULT means that at least one page of the source buffer was not
+ * available. VFS will re-try remaining I/O upon this error.
+ */
+ if (error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (FSYNC | FDSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+ loff_t pos, size_t *presid)
+{
+ int error = 0;
+ ssize_t resid;
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
+ UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
+
+ if (error) {
+ return (SET_ERROR(error));
+ } else if (presid == NULL) {
+ if (resid != 0) {
+ error = SET_ERROR(EIO);
+ }
+ } else {
+ *presid = resid;
+ }
+ return (error);
+}
+
+void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+ objset_t *os = zp->z_zfsvfs->z_os;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+ size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's
+ * written out and its checksum is being calculated
+ * that no one can change the data. We need to re-check
+ * blocksize after we get the lock in case it's changed!
+ */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ zfs_rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP,
+ * so there's no need to zio_flush() its
+ * vdevs (flushing would needlesly hurt
+ * performance, and doesn't work on
+ * indirect vdevs).
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ int error;
+
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
+ return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+ if ((zdp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relock for the "." case could leave us with
+ * reclaimed vnode.
+ */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires some rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+ int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
+{
+ znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error = 0;
+
+ /*
+ * Fast path lookup, however we must skip DNLC lookup
+ * for case folding or normalizing lookups because the
+ * DNLC code only stores the passed in name. This means
+ * creating 'a' and removing 'A' on a case insensitive
+ * file system would work, but DNLC still thinks 'a'
+ * exists and won't let you create it again on the next
+ * pass through fast path.
+ */
+ if (!(flags & LOOKUP_XATTR)) {
+ if (dvp->v_type != VDIR) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *vpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /*
+ * We don't allow recursive attributes..
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ *vpp = ZTOV(zp);
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+ if (error) {
+ vrele(ZTOV(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Check accessibility of directory if we're not coming in via
+ * VOP_CACHEDLOOKUP.
+ */
+ if (!cached) {
+#ifdef NOEXECCHECK
+ if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+ cnp->cn_flags &= ~NOEXECCHECK;
+ } else
+#endif
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ struct componentname cn;
+ vnode_t *zfsctl_vp;
+ int ltype;
+
+ ZFS_EXIT(zfsvfs);
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK1(dvp);
+ error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+ &zfsctl_vp);
+ if (error == 0) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = cnp->cn_nameiop;
+ cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+ cn.cn_lkflags = cnp->cn_lkflags;
+ error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+ vput(zfsctl_vp);
+ }
+ vn_lock(dvp, ltype | LK_RETRY);
+ return (error);
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ ZFS_EXIT(zfsvfs);
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+ error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+ return (error);
+ }
+
+ /*
+ * The loop is retry the lookup if the parent-child relationship
+ * changes during the dot-dot locking complexities.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+ if (error != 0)
+ *vpp = NULL;
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
+
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
+
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ * ct - caller context
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode,
+ znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype;
+#ifdef DEBUG_VFS_LOCKS
+ vnode_t *dvp = ZTOV(dzp);
+#endif
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ *zpp = NULL;
+
+ if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~S_ISVTX;
+
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+out:
+ VNCHECKREF(dvp);
+ if (error == 0) {
+ *zpp = zp;
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+
+/*ARGSUSED*/
+static int
+zfs_remove_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t xattr_obj;
+ uint64_t obj = 0;
+ dmu_tx_t *tx;
+ boolean_t unlinked;
+ uint64_t txtype;
+ int error;
+
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zp = VTOZ(vp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+ xattr_obj = 0;
+ xzp = NULL;
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vnevent_remove(vp, dvp, name, ct);
+
+ obj = zp->z_id;
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+
+ if (xzp) {
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ zfs_unlinked_add(zp, tx);
+ vp->v_vflag |= VV_NOSYNC;
+ }
+ /* XXX check changes to linux vnops */
+ txtype = TX_REMOVE;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+
+ if (xzp)
+ vrele(ZTOV(xzp));
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+
+int
+zfs_lookup_internal(znode_t *dzp, char *name, vnode_t **vpp,
+ struct componentname *cnp, int nameiop)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int error;
+
+ cnp->cn_nameptr = name;
+ cnp->cn_namelen = strlen(name);
+ cnp->cn_nameiop = nameiop;
+ cnp->cn_flags = ISLASTCN | SAVENAME;
+ cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ cnp->cn_cred = kcred;
+ cnp->cn_thread = curthread;
+
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
+ struct vop_lookup_args a;
+
+ a.a_gen.a_desc = &vop_lookup_desc;
+ a.a_dvp = ZTOV(dzp);
+ a.a_vpp = vpp;
+ a.a_cnp = cnp;
+ error = vfs_cache_lookup(&a);
+ } else {
+ error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred,
+ curthread, 0, B_FALSE);
+ }
+#ifdef ZFS_DEBUG
+ if (error) {
+ printf("got error %d on name %s on op %d\n", error, name,
+ nameiop);
+ kdb_backtrace();
+ }
+#endif
+ return (error);
+}
+
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+ vnode_t *vp;
+ int error;
+ struct componentname cn;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_remove_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr,
+ int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+
+ ASSERT(vap->va_type == VDIR);
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ ((vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+ *zpp = NULL;
+
+ if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+ *zpp = zp;
+
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+ acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (vp->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ vnevent_rmdir(vp, dvp, name, ct);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ cache_purge(dvp);
+
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ zfs_log_remove(zilog, tx, txtype, dzp, name,
+ ZFS_NO_OBJECT, B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ cache_purge(vp);
+out:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags)
+{
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
+ int *ncookies, ulong_t **cookies)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ edirent_t *eodp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ boolean_t check_sysattrs;
+ uint8_t type;
+ int ncooks;
+ ulong_t *cooks = NULL;
+ int flags = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = uio->uio_loffset;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ outbuf = NULL;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+ eodp = (struct edirent *)odp;
+
+ if (ncookies != NULL) {
+ /*
+ * Minimum entry size is dirent size and 1 byte for a file name.
+ */
+ ncooks = uio->uio_resid / (sizeof (struct dirent) -
+ sizeof (((struct dirent *)NULL)->d_name) + 1);
+ cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK);
+ *cookies = cooks;
+ *ncookies = ncooks;
+ }
+ /*
+ * If this VFS supports the system attribute view interface; and
+ * we're looking at an extended attribute directory; and we care
+ * about normalization conflicts on this vfs; then we must check
+ * for normalization conflicts with the sysattr name space.
+ */
+#ifdef TODO
+ check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+ (flags & V_RDDIR_ENTFLAGS);
+#else
+ check_sysattrs = 0;
+#endif
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+ off64_t *next = NULL;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ /*
+ * MacOS X can extract the object type here such as:
+ * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ */
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+ if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+ zap.za_normalization_conflict =
+ xattr_sysattr_casechk(zap.za_name);
+#else
+ panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+ }
+ }
+
+ if (flags & V_RDDIR_ACCFILTER) {
+ /*
+ * If we have no access at all, don't include
+ * this entry in the returned information
+ */
+ znode_t *ezp;
+ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+ goto skip_entry;
+ if (!zfs_has_access(ezp, cr)) {
+ vrele(ZTOV(ezp));
+ goto skip_entry;
+ }
+ vrele(ZTOV(ezp));
+ }
+
+ if (flags & V_RDDIR_ENTFLAGS)
+ reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+ else
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = SET_ERROR(EINVAL);
+ goto update;
+ }
+ break;
+ }
+ if (flags & V_RDDIR_ENTFLAGS) {
+ /*
+ * Add extended flag entry:
+ */
+ eodp->ed_ino = objnum;
+ eodp->ed_reclen = reclen;
+ /* NOTE: ed_off is the offset for the *next* entry */
+ next = &(eodp->ed_off);
+ eodp->ed_eflags = zap.za_normalization_conflict ?
+ ED_CASE_CONFLICT : 0;
+ (void) strncpy(eodp->ed_name, zap.za_name,
+ EDIRENT_NAMELEN(reclen));
+ eodp = (edirent_t *)((intptr_t)eodp + reclen);
+ } else {
+ /*
+ * Add normal entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ /* NOTE: d_off is the offset for the *next* entry. */
+ next = &odp->d_off;
+ strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ dirent_terminate(odp);
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+ }
+ outcount += reclen;
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+
+ skip_entry:
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ /* Fill the offset right after advancing the cursor. */
+ if (next != NULL)
+ *next = offset;
+ if (cooks != NULL) {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ncookies != NULL)
+ *ncookies -= ncooks;
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+ if (error != 0 && cookies != NULL) {
+ free(*cookies, M_TEMP);
+ *cookies = NULL;
+ *ncookies = 0;
+ }
+ return (error);
+}
+
+ulong_t zfs_fsync_sync_cnt = 4;
+
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ tsd_set(zfs_fsyncer_key, NULL);
+ return (0);
+}
+
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * If AT_XVATTR set, then optional attrs are requested
+ * flags - ATTR_NOACLCHECK (CIFS server context)
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds).
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error = 0;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ uint64_t mtime[2], ctime[2], crtime[2], rdev;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+ * Also, if we are the owner don't bother, since owner should
+ * always be allowed to read basic attributes of file.
+ */
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
+ if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+ skipaclchk, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
+ vn_fsid(vp, vap);
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = zp->z_links;
+ if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+ zp->z_links < ZFS_LINK_MAX)
+ vap->va_nlink++;
+ vap->va_size = zp->z_size;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_filerev = zp->z_seq;
+
+ /*
+ * Add in any requested optional attributes and the create time.
+ * Also set the corresponding bits in the returned attribute bitmap.
+ */
+ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ xoap->xoa_archive =
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ xoap->xoa_readonly =
+ ((zp->z_pflags & ZFS_READONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ xoap->xoa_system =
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ xoap->xoa_hidden =
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ xoap->xoa_nounlink =
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ xoap->xoa_immutable =
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ xoap->xoa_appendonly =
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ xoap->xoa_nodump =
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ xoap->xoa_opaque =
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ xoap->xoa_av_quarantined =
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ xoap->xoa_av_modified =
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ xoap->xoa_projinherit =
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ xoap->xoa_projid = zp->z_projid;
+ XVA_SET_RTN(xvap, XAT_PROJID);
+ }
+ }
+
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: zp - znode of file to be modified.
+ * vap - new attribute values.
+ * If AT_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ uint64_t saved_mode;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if we have ephemeral uid/gid or xvattr specified
+ * that file system is at proper version level
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ xva_init(&tmpxvattr);
+
+ /*
+ * Immutable files can only alter immutable bit and atime
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+ */
+
+ /*
+ * Verify timestamps doesn't overflow 32 bits.
+ * ZFS can handle large timestamps, but 32bit syscalls can't
+ * handle times greater than 2039. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ }
+ if (xoap != NULL && (mask & AT_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+ }
+
+ attrzp = NULL;
+ aclp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME) ||
+ ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, vp, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & AT_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+ * the bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((vp->v_type != VREG &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+ * If trim_mask is set then take ownership
+ * has been granted or write_acl is present and user
+ * has the ability to modify mode. In that case remove
+ * UID|GID and or MODE from mask so that
+ * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Save the mode, as secpolicy_vnode_setattr()
+ * will overwrite it with ova.va_mode.
+ */
+ saved_mode = vap->va_mode;
+ }
+ }
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask) {
+ vap->va_mask |= saved_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Recover the mode after
+ * secpolicy_vnode_setattr().
+ */
+ vap->va_mode = saved_mode;
+ }
+ }
+ }
+
+ /*
+ * secpolicy_vnode_setattr, or take ownership may have
+ * changed va_mask
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+ if (err == 0) {
+ err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+ if (err != 0)
+ vrele(ZTOV(attrzp));
+ }
+ if (err)
+ goto out2;
+ }
+ if (mask & AT_UID) {
+ new_uid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_uid != zp->z_uid &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_uid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & AT_GID) {
+ new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &fuidp);
+ if (new_gid != zp->z_gid &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_gid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+ * For the existed object that is upgraded from old system,
+ * its on-disk layout has no slot for the project ID attribute.
+ * But quota accounting logic needs to access related slots by
+ * offset directly. So we need to adjust old objects' layout
+ * to make the project ID to some unified and fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & AT_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+ ASSERT3U((uintptr_t)aclp, !=, 0);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ }
+
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime);
+ }
+ }
+
+ /*
+ * Do this after setting timestamps to prevent timestamp
+ * update from toggling bit
+ */
+
+ if (xoap && (mask & AT_XVATTR)) {
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ xoap->xoa_createtime = vap->va_birthtime;
+ /*
+ * restore trimmed off masks
+ * so that return masks can be set for caller.
+ */
+
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(vp->v_type == VREG);
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ }
+out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (attrzp)
+ vput(ZTOV(attrzp));
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * We acquire all but fdvp locks using non-blocking acquisitions. If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename. This acquire/release step ensures that we do not
+ * spin on a lock waiting for release. On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
+ */
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
+{
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK1(tdvp);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK1(*tvpp);
+
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK1(tdvp);
+ goto relock;
+ }
+ tdzp = VTOZ(tdvp);
+
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * We can not use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+ * Re-resolve tvp, if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+ * Now try acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK1(nvp);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ VOP_UNLOCK1(*svpp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ vput(nvp);
+ goto relock;
+ }
+ *tvpp = nvp;
+ }
+
+ return (0);
+
+out:
+ return (error);
+}
+
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
+
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
+
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
+
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr, int log)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
+ dmu_tx_t *tx;
+ char *snm = scnp->cn_nameptr;
+ char *tnm = tcnp->cn_nameptr;
+ int error = 0;
+
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Lock all four vnodes to ensure safety and semantics of renaming.
+ */
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
+ /*
+ * After we re-enter ZFS_ENTER() we will have to revalidate all
+ * znodes involved.
+ */
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
+ }
+
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
+ }
+
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+ * We can not use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
+ }
+
+ /*
+ * If we are using project inheritance, means if the directory has
+ * ZFS_PROJINHERIT set, then its descendant directories will inherit
+ * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+ * such case, we only allow renames into our tree when the project
+ * IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
+
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_check(szp, sdzp, tdzp)))
+ goto unlockout;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
+ }
+ } else {
+ if ((*tvpp)->v_type == VDIR) {
+ error = SET_ERROR(EISDIR);
+ goto unlockout;
+ }
+ }
+ }
+
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+ if (tzp)
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+ /*
+ * notify the target directory if it is not the same
+ * as source directory.
+ */
+ if (tdvp != sdvp) {
+ vnevent_rename_dest_dir(tdvp, ct);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto unlockout;
+ }
+
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
+
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ }
+ if (error == 0) {
+ cache_purge(*svpp);
+ if (*tvpp != NULL)
+ cache_purge(*tvpp);
+ cache_purge_negative(tdvp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(*svpp);
+ VOP_UNLOCK1(sdvp);
+
+out: /* original two vnodes are locked */
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (*tvpp != NULL)
+ VOP_UNLOCK1(*tvpp);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK1(tdvp);
+ return (error);
+}
+
+int
+zfs_rename(znode_t *sdzp, char *sname, znode_t *tdzp, char *tname,
+ cred_t *cr, int flags)
+{
+ struct componentname scn, tcn;
+ vnode_t *sdvp, *tdvp;
+ vnode_t *svp, *tvp;
+ int error;
+ svp = tvp = NULL;
+
+ sdvp = ZTOV(sdzp);
+ tdvp = ZTOV(tdzp);
+ error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
+ if (sdzp->z_zfsvfs->z_replay == B_FALSE)
+ VOP_UNLOCK1(sdvp);
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK1(svp);
+
+ vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK1(tdvp);
+ goto fail;
+ }
+
+ error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ * link - Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+ const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+ 0 /* projid */)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+ * for version 4 ZPL datsets the symlink will be an SA attribute
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ __DECONST(void *, link), len, tx);
+ else
+ zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp,
+ __DECONST(char *, name), __DECONST(char *, link));
+ *zpp = zp;
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure containing the link path.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+ int flags)
+{
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = tdzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t parent;
+ uid_t owner;
+
+ ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (ZTOV(szp)->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+ * If we are using project inheritance, means if the directory has
+ * ZFS_PROJINHERIT set, then its descendant directories will inherit
+ * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+ * such case, we only allow hard link creation in our tree when the
+ * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (szp->z_pflags & (ZFS_APPENDONLY |
+ ZFS_IMMUTABLE | ZFS_READONLY)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(tdzp, name, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ vnevent_link(ZTOV(szp), ct);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: ip - inode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ if (zp->z_sa_hdl == NULL) {
+ /*
+ * The fs has been unmounted, or we did a
+ * suspend/resume and this file no longer exists.
+ */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_unlinked) {
+ /*
+ * Fast path to recycle a vnode of a removed file.
+ */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+ zp->z_atime_dirty = 0;
+ dmu_tx_commit(tx);
+ }
+ }
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+
+CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid));
+CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid));
+
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+ fidp->fid_len = size;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+ case _PC_ACL_EXTENDED:
+ *valp = 0;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *valp = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *valp = ACL_MAX_ENTRIES;
+ return (0);
+
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ zfs_locked_range_t *lr;
+ vm_object_t object;
+ off_t start, end, obj_size;
+ uint_t blksz;
+ int pgsin_b, pgsin_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ start = IDX_TO_OFF(ma[0]->pindex);
+ end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+ /*
+ * Lock a range covering all required and optional pages.
+ * Note that we need to handle the case of the block size growing.
+ */
+ for (;;) {
+ blksz = zp->z_blksz;
+ lr = zfs_rangelock_enter(&zp->z_rangelock,
+ rounddown(start, blksz),
+ roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+ if (blksz == zp->z_blksz)
+ break;
+ zfs_rangelock_exit(lr);
+ }
+
+ object = ma[0]->object;
+ zfs_vmobject_wlock(object);
+ obj_size = object->un_pager.vnp.vnp_size;
+ zfs_vmobject_wunlock(object);
+ if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (zfs_vm_pagerret_bad);
+ }
+
+ pgsin_b = 0;
+ if (rbehind != NULL) {
+ pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+ pgsin_b = MIN(*rbehind, pgsin_b);
+ }
+
+ pgsin_a = 0;
+ if (rahead != NULL) {
+ pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+ if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+ pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+ pgsin_a = MIN(*rahead, pgsin_a);
+ }
+
+ /*
+ * NB: we need to pass the exact byte size of the data that we expect
+ * to read after accounting for the file size. This is required because
+ * ZFS will panic if we request DMU to read beyond the end of the last
+ * allocated block.
+ */
+ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+ MIN(end, obj_size) - (end - PAGE_SIZE));
+
+ zfs_rangelock_exit(lr);
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ if (error != 0)
+ return (zfs_vm_pagerret_error);
+
+ VM_CNT_INC(v_vnodein);
+ VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ return (zfs_vm_pagerret_ok);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int *a_rbehind;
+ int *a_rahead;
+};
+#endif
+
+static int
+zfs_freebsd_getpages(struct vop_getpages_args *ap)
+{
+
+ return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead));
+}
+
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+ int *rtvals)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ dmu_tx_t *tx;
+ struct sf_buf *sf;
+ vm_object_t object;
+ vm_page_t m;
+ caddr_t va;
+ size_t tocopy;
+ size_t lo_len;
+ vm_ooffset_t lo_off;
+ vm_ooffset_t off;
+ uint_t blksz;
+ int ncount;
+ int pcount;
+ int err;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ object = vp->v_object;
+ pcount = btoc(len);
+ ncount = pcount;
+
+ KASSERT(ma[0]->object == object, ("mismatching object"));
+ KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+ for (i = 0; i < pcount; i++)
+ rtvals[i] = zfs_vm_pagerret_error;
+
+ off = IDX_TO_OFF(ma[0]->pindex);
+ blksz = zp->z_blksz;
+ lo_off = rounddown(off, blksz);
+ lo_len = roundup(len + (off - lo_off), blksz);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+ zfs_vmobject_wlock(object);
+ if (len + off > object->un_pager.vnp.vnp_size) {
+ if (object->un_pager.vnp.vnp_size > off) {
+ int pgoff;
+
+ len = object->un_pager.vnp.vnp_size - off;
+ ncount = btoc(len);
+ if ((pgoff = (int)len & PAGE_MASK) != 0) {
+ /*
+ * If the object is locked and the following
+ * conditions hold, then the page's dirty
+ * field cannot be concurrently changed by a
+ * pmap operation.
+ */
+ m = ma[ncount - 1];
+ vm_page_assert_sbusied(m);
+ KASSERT(!pmap_page_is_write_mapped(m),
+ ("zfs_putpages: page %p is not read-only",
+ m));
+ vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+ pgoff);
+ }
+ } else {
+ len = 0;
+ ncount = 0;
+ }
+ if (ncount < pcount) {
+ for (i = ncount; i < pcount; i++) {
+ rtvals[i] = zfs_vm_pagerret_bad;
+ }
+ }
+ }
+ zfs_vmobject_wunlock(object);
+
+ if (ncount == 0)
+ goto out;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ goto out;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz < PAGE_SIZE) {
+ for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+ tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ va = zfs_map_page(ma[i], &sf);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ zfs_unmap_page(sf);
+ }
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(err);
+ /*
+ * XXX we should be passing a callback to undirty
+ * but that would make the locking messier
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
+ len, 0, NULL, NULL);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
+ VM_CNT_INC(v_vnodeout);
+ VM_CNT_ADD(v_vnodepgsout, ncount);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ zfs_rangelock_exit(lr);
+ if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (rtvals[0]);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_putpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+};
+#endif
+
+int
+zfs_freebsd_putpages(struct vop_putpages_args *ap)
+{
+
+ return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+ ap->a_rtvals));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_bmap_args {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+};
+#endif
+
+static int
+zfs_freebsd_bmap(struct vop_bmap_args *ap)
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_open_args {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_open(struct vop_open_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int error;
+
+ error = zfs_open(&vp, ap->a_mode, ap->a_cred);
+ if (error == 0)
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_close_args {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_close(struct vop_close_args *ap)
+{
+
+ return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_ioctl_args {
+ struct vnode *a_vp;
+ ulong_t a_command;
+ caddr_t a_data;
+ int a_fflag;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
+{
+
+ return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
+ ap->a_fflag, ap->a_cred, NULL));
+}
+
+static int
+ioflags(int ioflags)
+{
+ int flags = 0;
+
+ if (ioflags & IO_APPEND)
+ flags |= FAPPEND;
+ if (ioflags & IO_NDELAY)
+ flags |= FNONBLOCK;
+ if (ioflags & IO_SYNC)
+ flags |= (FSYNC | FDSYNC | FRSYNC);
+
+ return (flags);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_read_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_read(struct vop_read_args *ap)
+{
+
+ return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_write_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_write(struct vop_write_args *ap)
+{
+
+ return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_access_args {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_access(struct vop_access_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ accmode_t accmode;
+ int error = 0;
+
+
+ if (ap->a_accmode == VEXEC) {
+ if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
+ return (0);
+ }
+
+ /*
+ * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
+ */
+ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0)
+ error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+
+ /*
+ * VADMIN has to be handled by vaccess().
+ */
+ if (error == 0) {
+ accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0) {
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
+ }
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories.
+ */
+ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+ error = EACCES;
+ }
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+
+ ASSERT(cnp->cn_namelen < sizeof (nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
+
+ return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, cnp->cn_thread, 0, cached));
+}
+
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+ return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_cache_lookup(struct vop_lookup_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_create(struct vop_create_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc, mode;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ *ap->a_vpp = NULL;
+
+ rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode,
+ &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */);
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ if (zfsvfs->z_use_namecache &&
+ rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_remove(struct vop_remove_args *ap)
+{
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
+ ap->a_cnp->cn_cred, 0, NULL);
+
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readdir_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ ulong_t **a_cookies;
+};
+#endif
+
+static int
+zfs_freebsd_readdir(struct vop_readdir_args *ap)
+{
+
+ return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
+ ap->a_ncookies, ap->a_cookies));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fsync_args {
+ struct vnode *a_vp;
+ int a_waitfor;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_fsync(struct vop_fsync_args *ap)
+{
+
+ vop_stdfsync(ap);
+ return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_getattr(struct vop_getattr_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ xvattr_t xvap;
+ ulong_t fflags = 0;
+ int error;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE?. */
+ XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&xvap, XAT_NODUMP);
+ XVA_SET_REQ(&xvap, XAT_READONLY);
+ XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+ XVA_SET_REQ(&xvap, XAT_SYSTEM);
+ XVA_SET_REQ(&xvap, XAT_HIDDEN);
+ XVA_SET_REQ(&xvap, XAT_REPARSE);
+ XVA_SET_REQ(&xvap, XAT_OFFLINE);
+ XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+ error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
+ if (error != 0)
+ return (error);
+
+ /* Convert ZFS xattr into chflags. */
+#define FLAG_CHECK(fflag, xflag, xfield) do { \
+ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
+ fflags |= (fflag); \
+} while (0)
+ FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHECK(UF_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+
+#undef FLAG_CHECK
+ *vap = xvap.xva_vattr;
+ vap->va_flags = fflags;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_setattr(struct vop_setattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cred = ap->a_cred;
+ xvattr_t xvap;
+ ulong_t fflags;
+ uint64_t zflags;
+
+ vattr_init_mask(vap);
+ vap->va_mask &= ~AT_NOSET;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+
+ zflags = VTOZ(vp)->z_pflags;
+
+ if (vap->va_flags != VNOVAL) {
+ zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+ int error;
+
+ if (zfsvfs->z_use_fuids == B_FALSE)
+ return (EOPNOTSUPP);
+
+ fflags = vap->va_flags;
+ /*
+ * XXX KDM
+ * We need to figure out whether it makes sense to allow
+ * UF_REPARSE through, since we don't really have other
+ * facilities to handle reparse points and zfs_setattr()
+ * doesn't currently allow setting that attribute anyway.
+ */
+ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+ UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+ UF_OFFLINE|UF_SPARSE)) != 0)
+ return (EOPNOTSUPP);
+ /*
+ * Unprivileged processes are not permitted to unset system
+ * flags, or modify flags if any system flags are set.
+ * Privileged non-jail processes may not modify system flags
+ * if securelevel > 0 and any existing system flags are set.
+ * Privileged jail processes behave like privileged non-jail
+ * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+ * otherwise, they behave like unprivileged processes.
+ */
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+ spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+ error = securelevel_gt(cred, 0);
+ if (error != 0)
+ return (error);
+ }
+ } else {
+ /*
+ * Callers may only modify the file flags on
+ * objects they have VADMIN rights for.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, cred,
+ curthread)) != 0)
+ return (error);
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY |
+ ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+ if (fflags &
+ (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+ return (EPERM);
+ }
+ }
+
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
+ if (((fflags & (fflag)) && !(zflags & (zflag))) || \
+ ((zflags & (zflag)) && !(fflags & (fflag)))) { \
+ XVA_SET_REQ(&xvap, (xflag)); \
+ (xfield) = ((fflags & (fflag)) != 0); \
+ } \
+} while (0)
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE?. */
+ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+#undef FLAG_CHANGE
+ }
+ if (vap->va_birthtime.tv_sec != VNOVAL) {
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+ XVA_SET_REQ(&xvap, XAT_CREATETIME);
+ }
+ return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rename_args {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+};
+#endif
+
+static int
+zfs_freebsd_rename(struct vop_rename_args *ap)
+{
+ vnode_t *fdvp = ap->a_fdvp;
+ vnode_t *fvp = ap->a_fvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ vnode_t *tvp = ap->a_tvp;
+ int error;
+
+ ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+ ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+ error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred, 1);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_symlink_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+};
+#endif
+
+static int
+zfs_freebsd_symlink(struct vop_symlink_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
+ ap->a_target, &zp, cnp->cn_cred, 0 /* flags */);
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readlink_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_readlink(struct vop_readlink_args *ap)
+{
+
+ return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_link_args {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_link(struct vop_link_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *vp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+
+ if (tdvp->v_mount != vp->v_mount)
+ return (EXDEV);
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_link(VTOZ(tdvp), VTOZ(vp),
+ cnp->cn_nameptr, cnp->cn_cred, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+ return (0);
+}
+
+#if __FreeBSD_version >= 1300042
+#ifndef _SYS_SYSPROTO_H_
+struct vop_need_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ bool need;
+
+ if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER))
+ return (true);
+ need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+
+ return (need);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp != NULL);
+
+#if __FreeBSD_version < 1300042
+ /* Destroy the vm object and flush associated pages. */
+ vnode_destroy_vobject(vp);
+#endif
+ /*
+ * z_teardown_inactive_lock protects from a race with
+ * zfs_znode_dmu_fini in zfsvfs_teardown during
+ * force unmount.
+ */
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ if (zp->z_sa_hdl == NULL)
+ zfs_znode_free(zp);
+ else
+ zfs_zinactive(zp);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+
+ vp->v_data = NULL;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+ return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+ struct vnode *a_vp;
+ int a_name;
+ register_t *a_retval;
+} *ap;
+#endif
+
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+ ulong_t val;
+ int error;
+
+ error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+ curthread->td_ucred, NULL);
+ if (error == 0) {
+ *ap->a_retval = val;
+ return (error);
+ }
+ if (error != EOPNOTSUPP)
+ return (error);
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PIPE_BUF:
+ if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ }
+ return (EINVAL);
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
+ * extended attribute name:
+ *
+ * NAMESPACE PREFIX
+ * system freebsd:system:
+ * user (none, can be used to access ZFS fsattr(5) attributes
+ * created on Solaris)
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+ size_t size)
+{
+ const char *namespace, *prefix, *suffix;
+
+ /* We don't allow '/' character in attribute name. */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+ /* We don't allow attribute names that start with "freebsd:" string. */
+ if (strncmp(name, "freebsd:", 8) == 0)
+ return (EINVAL);
+
+ bzero(attrname, size);
+
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_USER:
+#if 0
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_USER_STRING;
+ suffix = ":";
+#else
+ /*
+ * This is the default namespace by which we can access all
+ * attributes created on Solaris.
+ */
+ prefix = namespace = suffix = "";
+#endif
+ break;
+ case EXTATTR_NAMESPACE_SYSTEM:
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+ suffix = ":";
+ break;
+ case EXTATTR_NAMESPACE_EMPTY:
+ default:
+ return (EINVAL);
+ }
+ if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+ name) >= size) {
+ return (ENAMETOOLONG);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operating to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FREAD;
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ if (ap->a_size != NULL) {
+ error = VOP_GETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ *ap->a_size = (size_t)va.va_size;
+ } else if (ap->a_uio != NULL)
+ error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_deleteextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ vnode_t *xvp = NULL, *vp;
+ int error;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+ UIO_SYSSPACE, attrname, xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FFLAGS(O_WRONLY | O_CREAT);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+ NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ VATTR_NULL(&va);
+ va.va_size = 0;
+ error = VOP_SETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_listextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve extended attributes on a vnode.
+ */
+static int
+zfs_listextattr(struct vop_listextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrprefix[16];
+ uint8_t dirbuf[sizeof (struct dirent)];
+ struct dirent *dp;
+ struct iovec aiov;
+ struct uio auio, *uio = ap->a_uio;
+ size_t *sizep = ap->a_size;
+ size_t plen;
+ vnode_t *xvp = NULL, *vp;
+ int done, error, eof, pos;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
+ sizeof (attrprefix));
+ if (error != 0)
+ return (error);
+ plen = strlen(attrprefix);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (sizep != NULL)
+ *sizep = 0;
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ /*
+ * ENOATTR means that the EA directory does not yet exist,
+ * i.e. there are no extended attributes there.
+ */
+ if (error == ENOATTR)
+ error = 0;
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+ UIO_SYSSPACE, ".", xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = 0;
+
+ do {
+ uint8_t nlen;
+
+ aiov.iov_base = (void *)dirbuf;
+ aiov.iov_len = sizeof (dirbuf);
+ auio.uio_resid = sizeof (dirbuf);
+ error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+ done = sizeof (dirbuf) - auio.uio_resid;
+ if (error != 0)
+ break;
+ for (pos = 0; pos < done; ) {
+ dp = (struct dirent *)(dirbuf + pos);
+ pos += dp->d_reclen;
+ /*
+ * XXX: Temporarily we also accept DT_UNKNOWN, as this
+ * is what we get when attribute was created on Solaris.
+ */
+ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+ continue;
+ if (plen == 0 &&
+ strncmp(dp->d_name, "freebsd:", 8) == 0)
+ continue;
+ else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+ continue;
+ nlen = dp->d_namlen - plen;
+ if (sizep != NULL)
+ *sizep += 1 + nlen;
+ else if (uio != NULL) {
+ /*
+ * Format of extattr name entry is one byte for
+ * length and the rest for name.
+ */
+ error = uiomove(&nlen, 1, uio->uio_rw, uio);
+ if (error == 0) {
+ error = uiomove(dp->d_name + plen, nlen,
+ uio->uio_rw, uio);
+ }
+ if (error != 0)
+ break;
+ }
+ }
+ } while (!eof && error == 0);
+
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getacl_args {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+int
+zfs_freebsd_getacl(struct vop_getacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+ if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)))
+ return (error);
+
+ error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt);
+ if (vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setacl_args {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+int
+zfs_freebsd_setacl(struct vop_setacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+ int aclbsize; /* size of acl list in bytes */
+ aclent_t *aaclp;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ if (ap->a_aclp == NULL)
+ return (EINVAL);
+
+ if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
+ return (EINVAL);
+
+ /*
+ * With NFSv4 ACLs, chmod(2) may need to add additional entries,
+ * splitting every entry into two and appending "canonical six"
+ * entries at the end. Don't allow for setting an ACL that would
+ * cause chmod(2) to run out of ACL entries.
+ */
+ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
+ return (ENOSPC);
+
+ error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
+ if (error != 0)
+ return (error);
+
+ vsecattr.vsa_mask = VSA_ACE;
+ aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclentsz = aclbsize;
+
+ aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
+ error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
+ kmem_free(aaclp, aclbsize);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_aclcheck_args {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+int
+zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ vnode_t *covered_vp;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *zp = VTOZ(vp);
+ int ltype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, run the operation
+ * on the covered vnode.
+ */
+ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+ char name[MAXNAMLEN + 1];
+ znode_t *dzp;
+ size_t len;
+
+ error = zfs_znode_parent_and_name(zp, &dzp, name);
+ if (error == 0) {
+ len = strlen(name);
+ if (*ap->a_buflen < len)
+ error = SET_ERROR(ENOMEM);
+ }
+ if (error == 0) {
+ *ap->a_buflen -= len;
+ bcopy(name, ap->a_buf + *ap->a_buflen, len);
+ *ap->a_vpp = ZTOV(dzp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ZFS_EXIT(zfsvfs);
+
+ covered_vp = vp->v_mount->mnt_vnodecovered;
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(covered_vp);
+#else
+ vhold(covered_vp);
+#endif
+ ltype = VOP_ISLOCKED(vp);
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300045
+ error = vget_finish(covered_vp, LK_SHARED, vs);
+#else
+ error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
+#endif
+ if (error == 0) {
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+ vput(covered_vp);
+ }
+ vn_lock(vp, ltype | LK_RETRY);
+ if (VN_IS_DOOMED(vp))
+ error = SET_ERROR(ENOENT);
+ return (error);
+}
+
+#ifdef DIAGNOSTIC
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lock1_args {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+};
+#endif
+
+static int
+zfs_lock(struct vop_lock1_args *ap)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ int err;
+
+#if __FreeBSD_version >= 1300064
+ err = vop_lock(ap);
+#else
+ err = vop_stdlock(ap);
+#endif
+ if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
+ vp = ap->a_vp;
+ zp = vp->v_data;
+ if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) &&
+ zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
+ }
+ return (err);
+}
+#endif
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_freebsd_inactive,
+#if __FreeBSD_version >= 1300042
+ .vop_need_inactive = zfs_freebsd_need_inactive,
+#endif
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_access = zfs_freebsd_access,
+ .vop_allocate = VOP_EINVAL,
+ .vop_lookup = zfs_cache_lookup,
+ .vop_cachedlookup = zfs_freebsd_cachedlookup,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_create = zfs_freebsd_create,
+ .vop_mknod = (vop_mknod_t *)zfs_freebsd_create,
+ .vop_mkdir = zfs_freebsd_mkdir,
+ .vop_readdir = zfs_freebsd_readdir,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_open = zfs_freebsd_open,
+ .vop_close = zfs_freebsd_close,
+ .vop_rmdir = zfs_freebsd_rmdir,
+ .vop_ioctl = zfs_freebsd_ioctl,
+ .vop_link = zfs_freebsd_link,
+ .vop_symlink = zfs_freebsd_symlink,
+ .vop_readlink = zfs_freebsd_readlink,
+ .vop_read = zfs_freebsd_read,
+ .vop_write = zfs_freebsd_write,
+ .vop_remove = zfs_freebsd_remove,
+ .vop_rename = zfs_freebsd_rename,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_bmap = zfs_freebsd_bmap,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getextattr = zfs_getextattr,
+ .vop_deleteextattr = zfs_deleteextattr,
+ .vop_setextattr = zfs_setextattr,
+ .vop_listextattr = zfs_listextattr,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+ .vop_getpages = zfs_freebsd_getpages,
+ .vop_putpages = zfs_freebsd_putpages,
+ .vop_vptocnp = zfs_vptocnp,
+#if __FreeBSD_version >= 1300064
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#else
+ .vop_lock1 = vop_lock,
+#endif
+ .vop_unlock = vop_unlock,
+ .vop_islocked = vop_islocked,
+#else
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#endif
+#endif
+};
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_access = zfs_freebsd_access,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
+
+/*
+ * special share hidden files vnode operations template
+ */
+struct vop_vector zfs_shareops = {
+ .vop_default = &default_vnodeops,
+ .vop_access = zfs_freebsd_access,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_pathconf = zfs_freebsd_pathconf,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c
new file mode 100644
index 000000000..b9d5472b2
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_znode.c
@@ -0,0 +1,1987 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <[email protected]> */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+#include <sys/refcount.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+ SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+
+/*
+ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define ZNODE_STATS
+#endif /* DEBUG */
+
+#ifdef ZNODE_STATS
+#define ZNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define ZNODE_STAT_ADD(stat) /* nothing */
+#endif /* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (ie: libzpool) are not put under
+ * #ifdef_KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
+static kmem_cache_t *znode_cache = NULL;
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
+
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_acl_cached = NULL;
+ zp->z_vnode = NULL;
+ zp->z_moved = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ ASSERT3P(zp->z_vnode, ==, NULL);
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_acl_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_acl_cached == NULL);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+ // kmem_cache_set_move(znode_cache, zfs_znode_move);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+ rw_destroy(&zfsvfs_lock);
+}
+
+
+static int
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ zfs_acl_ids_t acl_ids;
+ vattr_t vattr;
+ znode_t *sharezp;
+ znode_t *zp;
+ int error;
+
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0555;
+ vattr.va_uid = crgetuid(kcred);
+ vattr.va_gid = crgetgid(kcred);
+
+ sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
+ sharezp->z_unlinked = 0;
+ sharezp->z_atime_dirty = 0;
+ sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
+
+ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+ kcred, NULL, &acl_ids));
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, sharezp);
+ POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+ zfsvfs->z_shares_dir = sharezp->z_id;
+
+ zfs_acl_ids_free(&acl_ids);
+ sa_handle_destroy(sharezp->z_sa_hdl);
+ kmem_cache_free(znode_cache, sharezp);
+
+ return (error);
+}
+
+/*
+ * define a couple of values we need available
+ * for both 64 and 32 bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ /*
+ * Slap on VROOT if we are the root znode unless we are the root
+ * node of a snapshot mounted under .zfs.
+ */
+ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
+ ZTOV(zp)->v_flag |= VROOT;
+
+ vn_exists(ZTOV(zp));
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
+ zp->z_unlinked ||
+ RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+ /* copied from insmntque_stddtr */
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Construct a new znode/vnode and intialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+#ifdef notyet
+ uint64_t mtime[2], ctime[2];
+#endif
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
+ int error;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+#if __FreeBSD_version >= 1300076
+ KASSERT(curthread->td_vp_reserved != NULL,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#else
+ KASSERT(curthread->td_vp_reserv > 0,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#endif
+ error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+ if (error != 0) {
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
+ zp->z_vnode = vp;
+ vp->v_data = zp;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
+
+ /*
+ * Defer setting z_zfsvfs until the znode is ready to be a candidate for
+ * the zfs_znode_move() callback.
+ */
+ zp->z_sa_hdl = NULL;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ vp = ZTOV(zp);
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+#ifdef notyet
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+#endif
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zfs_vnode_forget(vp);
+ zp->z_vnode = NULL;
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = mode;
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ vp->v_type = IFTOVT((mode_t)mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VFIFO:
+ vp->v_op = &zfs_fifoops;
+ break;
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+ vp->v_op = &zfs_shareops;
+ }
+ break;
+ default:
+ break;
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning z_zfsvfs makes the
+ * znode eligible for zfs_znode_move().
+ */
+ zp->z_zfsvfs = zfsvfs;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Acquire vnode lock before making it available to the world.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(vp);
+ if (vp->v_type != VFIFO)
+ VN_LOCK_ASHARE(vp);
+
+ return (zp);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * bonuslen - length of bonus buffer
+ * setaclp - File/Dir initial ACL
+ * fuidp - Tracks fuid allocation.
+ *
+ * OUT: zpp - allocated znode
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
+ timestruc_t now;
+ uint64_t gen, obj;
+ int err;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+
+ ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE));
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ vfs_timestamp(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (vap->va_type == VDIR) {
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
+ }
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ /*
+ * No execs denied will be deterimed when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Setup the array of attributes to be replaced/set on the new file
+ *
+ * order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+
+ if (vap->va_mask & AT_XVATTR)
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ if (!(flag & IS_ROOT_NODE)) {
+ vnode_t *vp;
+
+ vp = ZTOV(*zpp);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ err = insmntque(vp, zfsvfs->z_vfs);
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ sa_handle_t *hdl;
+ struct thread *td;
+ int locked;
+ int err;
+
+ td = curthread;
+ getnewvnode_reserve_();
+again:
+ *zpp = NULL;
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
+ */
+ ASSERT3P(zp, !=, NULL);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ vp = ZTOV(zp);
+ /*
+ * Don't let the vnode disappear after
+ * ZFS_OBJ_HOLD_EXIT.
+ */
+ VN_HOLD(vp);
+ *zpp = zp;
+ err = 0;
+ }
+
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ if (err) {
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
+ VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread peforms
+ * a ZFS vnode operation on the vnode. That either
+ * should not happen if the vnode is dead or the thread
+ * should also have a refrence to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
+ }
+ VI_UNLOCK(vp);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ /*
+ * Not found create new znode/vnode
+ * but only if file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc()
+ *
+ * if zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
+ err = insmntque(vp, zfsvfs->z_vfs);
+ if (err == 0) {
+ vp->v_hash = obj_num;
+ VOP_UNLOCK1(vp);
+ } else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
+ }
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ vnode_t *vp;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
+ int err;
+ int count = 0;
+ uint64_t gen;
+
+ /*
+ * Remove cached pages before reloading the znode, so that they are not
+ * lingering after we run into any error. Ideally, we should vgone()
+ * the vnode in case of error, but currently we cannot do that
+ * because of the LOR between the vnode lock and z_teardown_lock.
+ * So, instead, we have to "doom" the znode in the illumos style.
+ */
+ vp = ZTOV(zp);
+ vn_pages_remove(vp, 0, 0);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * It is highly improbable but still quite possible that two
+ * objects in different datasets are created with the same
+ * object numbers and in transaction groups with the same
+ * numbers. znodes corresponding to those objects would
+ * have the same z_id and z_gen, but their other attributes
+ * may be different.
+ * zfs recv -F may replace one of such objects with the other.
+ * As a result file properties recorded in the replaced
+ * object's vnode may no longer match the received object's
+ * properties. At present the only cached property is the
+ * files type recorded in v_type.
+ * So, handle this case by leaving the old vnode and znode
+ * disassociated from the actual object. A new vnode and a
+ * znode will be created if the object is accessed
+ * (e.g. via a look-up). The old vnode and znode will be
+ * recycled when the last vnode reference is dropped.
+ */
+ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatical removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (zp->z_links == 0);
+ if (zp->z_unlinked) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (0);
+ }
+
+ zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size)
+ vnode_pager_setsize(vp, zp->z_size);
+
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+ * Don't allow a zfs_zget() while were trying to release this znode
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ zp->z_vnode = NULL;
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ kmem_cache_free(znode_cache, zp);
+
+}
+
+void
+zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
+{
+ timestruc_t now;
+
+ vfs_timestamp(&now);
+
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
+
+ if (flag & AT_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & AT_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ if (zp->z_zfsvfs->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
+
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
+}
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
+
+ zfs_rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ if (error == 0) {
+ /*
+ * In FreeBSD we cannot free block in the middle of a file,
+ * but only at the end of a file, so this code path should
+ * never happen.
+ */
+ vnode_pager_setsize(ZTOV(zp), off);
+ }
+
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ vnode_pager_setsize(vp, end);
+
+ zfs_rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - end of range (0 => EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ else
+ return (error);
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ return (error);
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ zfsvfs_t *zfsvfs;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ VATTR_NULL(&vattr);
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
+
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ rootzp->z_zfsvfs = zfsvfs;
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+ POINTER_INVALIDATE(&rootzp->z_zfsvfs);
+
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ kmem_cache_free(znode_cache, rootzp);
+
+ /*
+ * Create shares directory
+ */
+
+ error = zfs_create_share_dir(zfsvfs, tx);
+
+ ASSERT(error == 0);
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+ * file stays around, when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t parent;
+ int is_xattrdir;
+ int err;
+
+ /* Extended attributes should not be visible as regular files. */
+ if ((zp->z_pflags & ZFS_XATTR) != 0)
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+ &parent, &is_xattrdir);
+ if (err != 0)
+ return (err);
+ ASSERT0(is_xattrdir);
+
+ /* No name as this is a root object. */
+ if (parent == zp->z_id)
+ return (SET_ERROR(EINVAL));
+
+ err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+ ZFS_DIRENT_OBJ(-1ULL), buf);
+ if (err != 0)
+ return (err);
+ err = zfs_zget(zfsvfs, parent, dzpp);
+ return (err);
+}
+#endif /* _KERNEL */
diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c
new file mode 100644
index 000000000..d6496a7ef
--- /dev/null
+++ b/module/os/freebsd/zfs/zio_crypt.c
@@ -0,0 +1,1882 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption /Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate that actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second, is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
+
+/*
+ * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many
+ * calls, to test decryption error handling code paths.
+ */
+uint64_t zio_decrypt_fail_fraction = 0;
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+static void
+zio_crypt_key_destroy_early(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ bzero(&key->zk_session, sizeof (key->zk_session));
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+}
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+
+ freebsd_crypt_freesession(&key->zk_session);
+ zio_crypt_key_destroy_early(key);
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech __unused;
+ uint_t keydata_len;
+ zio_crypt_info_t *ci = NULL;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = &key->zk_hmac_key;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ ret = freebsd_crypt_newsession(&key->zk_session, ci,
+ &key->zk_current_key);
+ if (ret)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech __unused;
+
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ freebsd_crypt_freesession(&key->zk_session);
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[key->zk_crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto out_unlock;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+/*
+ * The implemenation for FreeBSD's OpenCrypto.
+ *
+ * The big difference between ICP and FOC is that FOC uses a single
+ * buffer for input and output. This means that (for AES-GCM, the
+ * only one supported right now) the source must be copied into the
+ * destination, and the destination must have the AAD, and the tag/MAC,
+ * already associated with it. (Both implementations can use a uio.)
+ *
+ * Since the auth data is part of the iovec array, all we need to know
+ * is the length: 0 means there's no AAD.
+ *
+ */
+static int
+zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
+ uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
+ uio_t *uio, uint_t auth_len)
+{
+ zio_crypt_info_t *ci;
+ int ret;
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+
+ ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
+ datalen, auth_len);
+ if (ret != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Returning error %s\n",
+ __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
+#endif
+ ret = SET_ERROR(encrypt ? EIO : ECKSUM);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+ * input and output. Also, the AAD (for AES-GMC at least)
+ * needs to logically go in front.
+ */
+ uio_t cuio;
+ iovec_t iovecs[4];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the plain text (source) to the cipher buffer (dest).
+ * We set iovecs[0] -- the authentication data -- below.
+ */
+ bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len);
+ bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out,
+ SHA512_HMAC_KEYLEN);
+ iovecs[1].iov_base = keydata_out;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = hmac_keydata_out;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+
+ cuio.uio_iov = iovecs;
+ cuio.uio_iovcnt = 4;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+ * input and output. Also, the AAD (for AES-GMC at least)
+ * needs to logically go in front.
+ */
+ uio_t cuio;
+ iovec_t iovecs[4];
+ void *src, *dst;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the encrypted buffer (source) to the plain buffer
+ * (dest). We set iovecs[0] -- the authentication data --
+ * below.
+ */
+ dst = key->zk_master_keydata;
+ src = keydata;
+
+ bcopy(src, dst, keydata_len);
+
+ dst = key->zk_hmac_keydata;
+ src = hmac_keydata;
+ bcopy(src, dst, SHA512_HMAC_KEYLEN);
+
+ iovecs[1].iov_base = key->zk_master_keydata;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = key->zk_hmac_keydata;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+
+ cuio.uio_iov = iovecs;
+ cuio.uio_iovcnt = 4;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ crypto_mac(&key->zk_hmac_key, data, datalen,
+ raw_digestbuf, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides what fields from blk_prop are included in
+ * the on-disk various MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ int avoidlint = SPA_MINBLOCKSIZE;
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, avoidlint);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, avoidlint);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+ * must be done after decoding the mac, since the endianness
+ * will get zero'd out here.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ crypto_mac_update(ctx, &bab, bab_len);
+
+ return (0);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ crypto_mac_update(ctx, adnp, sizeof (tmp_dncore));
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+/* ARGSUSED */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ struct hmac_ctx hash_ctx;
+ struct hmac_ctx *ctx = &hash_ctx;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* XXX check dnode type ... */
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(uio_t *uio)
+{
+ if (uio->uio_iov)
+ kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
+#endif
+ return (SET_ERROR(ECKSUM));
+ }
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+/*
+ * The OpenCrypto used in FreeBSD does not use seperate source and
+ * destination buffers; instead, the same buffer is used. Further, to
+ * accomodate some of the drivers, the authbuf needs to be logically before
+ * the data. This means that we need to copy the source to the destination,
+ * and set up an extra iovec_t at the beginning to handle the authbuf.
+ * It also means we'll only return one uio_t, which we do via the clumsy
+ * ifdef in the function declaration.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+ uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint64_t txtype, lr_len;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ /* cipherbuf always needs an extra iovec for the MAC */
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ /*
+ * We need at least two iovecs -- one for the AAD,
+ * one for the MAC.
+ */
+ bcopy(src, dst, datalen);
+ nr_dst = 2;
+
+ /* find the start and end record of the log block */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /* calculate the number of encrypted iovecs we will need */
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ nr_src = 0;
+ nr_dst += nr_iovecs;
+
+ /* allocate the iovec arrays */
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(src_iovecs, nr_src * sizeof (iovec_t));
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+ }
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, dst, sizeof (zil_chain_t));
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ /* loop over records again, filling in iovecs */
+ /* The first one will contain the authbuf */
+ nr_iovecs = 1;
+
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ ASSERT3P(dst_iovecs, !=, NULL);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp if exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ nr_iovecs++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_base = (char *)
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+
+ out_uio->uio_iov = dst_iovecs;
+ out_uio->uio_iovcnt = nr_dst;
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ out_uio->uio_iov = NULL;
+ out_uio->uio_iovcnt = 0;
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ bcopy(src, dst, datalen);
+ nr_dst = 2;
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or a * != 0 check, which will work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ nr_src = 0;
+ nr_dst += nr_iovecs;
+
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(src_iovecs, nr_src * sizeof (iovec_t));
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+ }
+
+ nr_iovecs = 1;
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable so we mask those out values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ ASSERT3U(nr_iovecs, <, nr_dst);
+ ASSERT3P(dst_iovecs, !=, NULL);
+ dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ nr_iovecs++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+ out_uio->uio_iov = dst_iovecs;
+ out_uio->uio_iovcnt = nr_dst;
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ out_uio->uio_iov = NULL;
+ out_uio->uio_iovcnt = 0;
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio,
+ uint_t *enc_len)
+{
+ int ret;
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+ void *src, *dst;
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+ cipher_iovecs[0].iov_base = dst;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ out_uio->uio_iov = cipher_iovecs;
+ out_uio->uio_iovcnt = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ out_uio->uio_iov = NULL;
+ out_uio->uio_iovcnt = 0;
+
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ cuio->uio_segflg = UIO_SYSSPACE;
+
+ mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+ mac_iov->iov_base = (void *)mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int faile_decrypt_size;
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ uio_t puio, cuio;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ freebsd_crypt_session_t *tmpl = NULL;
+ uint8_t *authbuf = NULL;
+
+ bzero(&puio, sizeof (uio_t));
+ bzero(&cuio, sizeof (uio_t));
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+ __FUNCTION__,
+ encrypt ? "encrypt" : "decrypt",
+ key, salt, ot, iv, mac, datalen,
+ byteswap ? "byteswap" : "native_endian", plainbuf,
+ cipherbuf, no_crypt);
+
+ printf("\tkey = {");
+ for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+ printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+ printf("}\n");
+#endif
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = &key->zk_session;
+ } else {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /* perform the encryption / decryption */
+ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+ ckey, iv, enc_len, &cuio, auth_len);
+ if (ret != 0)
+ goto error;
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (!encrypt) {
+ if (failed_decrypt_buf != NULL)
+ kmem_free(failed_decrypt_buf, failed_decrypt_size);
+ failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+ failed_decrypt_size = datalen;
+ bcopy(cipherbuf, failed_decrypt_buf, datalen);
+ }
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+ return (SET_ERROR(ret));
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (SET_ERROR(ret));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 000000000..bef97a9b3
--- /dev/null
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <[email protected]>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska <[email protected]> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system. Except when they're simply character devices (volmode=dev).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil_impl.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include "zfs_namecheck.h"
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
+#ifdef ZVOL_LOCK_DEBUG
+#define ZVOL_RW_READER RW_WRITER
+#define ZVOL_RW_READ_HELD RW_WRITE_HELD
+#else
+#define ZVOL_RW_READER RW_READER
+#define ZVOL_RW_READ_HELD RW_READ_HELD
+#endif
+
+enum zvol_geom_state {
+ ZVOL_GEOM_UNINIT,
+ ZVOL_GEOM_STOPPED,
+ ZVOL_GEOM_RUNNING,
+};
+
+struct zvol_state_os {
+ int zso_volmode;
+#define zso_dev _zso_state._zso_dev
+#define zso_geom _zso_state._zso_geom
+ union {
+ /* volmode=dev */
+ struct zvol_state_dev {
+ struct cdev *zsd_cdev;
+ uint64_t zsd_sync_cnt;
+ } _zso_dev;
+
+ /* volmode=geom */
+ struct zvol_state_geom {
+ struct g_provider *zsg_provider;
+ struct bio_queue_head zsg_queue;
+ struct mtx zsg_queue_mtx;
+ enum zvol_geom_state zsg_state;
+ } _zso_geom;
+ } _zso_state;
+};
+
+struct proc *zfsproc;
+
+static uint32_t zvol_minors;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+ "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t zvol_cdev_open;
+static d_close_t zvol_cdev_close;
+static d_ioctl_t zvol_cdev_ioctl;
+static d_read_t zvol_cdev_read;
+static d_write_t zvol_cdev_write;
+static d_strategy_t zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_name = "zvol",
+ .d_version = D_VERSION,
+ .d_flags = D_DISK | D_TRACKCLOSE,
+ .d_open = zvol_cdev_open,
+ .d_close = zvol_cdev_close,
+ .d_ioctl = zvol_cdev_ioctl,
+ .d_read = zvol_cdev_read,
+ .d_write = zvol_cdev_write,
+ .d_strategy = zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+static void zvol_geom_bio_check_zilog(struct bio *bp);
+/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int err = 0;
+ boolean_t drop_suspend = B_TRUE;
+
+ if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * if zfs_geom_probe_vdev_key is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock
+ */
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flag & FWRITE));
+ if (err)
+ goto out_mutex;
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+ dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+ err = EROFS;
+ goto out_open_count;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = EBUSY;
+ goto out_open_count;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = EBUSY;
+ goto out_open_count;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count += count;
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+
+out_open_count:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (SET_ERROR(err));
+}
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_open_count == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_open_count > 0);
+
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if ((zv->zv_open_count - count) == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count -= count;
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+ g_topology_assert();
+
+ mutex_enter(&zv->zv_state_lock);
+ VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
+ mutex_exit(&zv->zv_state_lock);
+ zsg->zsg_provider = NULL;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+ * To make it easier we expect either open or close, but not both
+ * at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+ * ace != 0, because GEOM already handles that and handles it a bit
+ * differently. GEOM allows for multiple read/exclusive consumers and
+ * ZFS allows only one exclusive consumer, no matter if it is reader or
+ * writer. I like better the way GEOM works so I'll leave it for GEOM
+ * to decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_geom_open(pp, flags, count);
+ else
+ error = zvol_geom_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv = arg;
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct bio *bp;
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ for (;;) {
+ mtx_lock(&zsg->zsg_queue_mtx);
+ bp = bioq_takefirst(&zsg->zsg_queue);
+ if (bp == NULL) {
+ if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+ zsg->zsg_state = ZVOL_GEOM_RUNNING;
+ wakeup(&zsg->zsg_state);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+ PRIBIO | PDROP, "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_geom_bio_check_zilog(bp);
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ zvol_geom_bio_strategy(bp);
+ break;
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+ rw_exit(&zv->zv_suspend_lock);
+ }
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ boolean_t first;
+
+ if (bp->bio_cmd == BIO_GETATTR) {
+ if (zvol_geom_bio_getattr(bp))
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
+
+ if (!THREAD_CAN_SLEEP()) {
+ mtx_lock(&zsg->zsg_queue_mtx);
+ first = (bioq_first(&zsg->zsg_queue) == NULL);
+ bioq_insert_tail(&zsg->zsg_queue, bp);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ if (first)
+ wakeup_one(&zsg->zsg_queue);
+ return;
+ }
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_geom_bio_check_zilog(bp);
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ zvol_geom_bio_strategy(bp);
+ break;
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+ rw_exit(&zv->zv_suspend_lock);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return (0);
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+ return (0);
+ }
+ return (1);
+}
+
+static void
+zvol_geom_bio_check_zilog(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ zvol_ensure_zilog(zv);
+ default:
+ break;
+ }
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+ zvol_state_t *zv;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t doread = 0;
+ boolean_t is_dumpified;
+ boolean_t sync;
+
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = SET_ERROR(EROFS);
+ goto out;
+ }
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ goto sync;
+ case BIO_READ:
+ doread = 1;
+ case BIO_WRITE:
+ case BIO_DELETE:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ */
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+unlock:
+ zfs_rangelock_exit(lr);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = EINVAL;
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+}
+
+/*
+ * Character device mode implementation
+ */
+
+static int
+zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+ /*
+ * uio_loffset == volsize isn't an error as
+ * its required for EOF processing.
+ */
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+ return (SET_ERROR(EIO));
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
+ uio->uio_resid, RL_READER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio->uio_loffset)
+ bytes = volsize - uio->uio_loffset;
+
+ error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+static int
+zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+ return (SET_ERROR(EIO));
+
+ sync = (ioflag & IO_SYNC) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+ rw_exit(&zv->zv_suspend_lock);
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
+ uio->uio_resid, RL_WRITER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio->uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ return (error);
+}
+
+static int
+zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ int err = 0;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flags & FWRITE));
+ if (err)
+ goto out_locked;
+ }
+
+ if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = EROFS;
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = EBUSY;
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flags & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = EBUSY;
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count++;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt++;
+ if (zsd->zsd_sync_cnt == 1)
+ zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+
+out_opened:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+out_locked:
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (SET_ERROR(err));
+}
+
+static int
+zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_open_count == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_open_count > 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count--;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt--;
+ }
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static int
+zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
+ int fflag, struct thread *td)
+{
+ zvol_state_t *zv;
+ zfs_locked_range_t *lr;
+ off_t offset, length;
+ int i, error;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ error = 0;
+ KASSERT(zv->zv_open_count > 0,
+ ("Device with zero access count in %s", __func__));
+
+ i = IOCPARM_LEN(cmd);
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(uint32_t *)data = DEV_BSIZE;
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)data = zv->zv_volsize;
+ break;
+ case DIOCGFLUSH:
+ if (zv->zv_zilog != NULL)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ break;
+ case DIOCGDELETE:
+ if (!zvol_unmap_enabled)
+ break;
+
+ offset = ((off_t *)data)[0];
+ length = ((off_t *)data)[1];
+ if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+ offset < 0 || offset >= zv->zv_volsize ||
+ length <= 0) {
+ printf("%s: offset=%jd length=%jd\n", __func__, offset,
+ length);
+ error = EINVAL;
+ break;
+ }
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
+ RL_WRITER);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ sync = FALSE;
+ dmu_tx_abort(tx);
+ } else {
+ sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+ zvol_log_truncate(zv, tx, offset, length, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ offset, length);
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+ case DIOCGSTRIPESIZE:
+ *(off_t *)data = zv->zv_volblocksize;
+ break;
+ case DIOCGSTRIPEOFFSET:
+ *(off_t *)data = 0;
+ break;
+ case DIOCGATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (strcmp(arg->name, "GEOM::candelete") == 0)
+ arg->value.i = 1;
+ else if (strcmp(arg->name, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = refd / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(
+ spa_normal_class(spa));
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = refd / DEV_BSIZE;
+ } else
+ error = ENOIOCTL;
+ break;
+ }
+ case FIOSEEKHOLE:
+ case FIOSEEKDATA: {
+ off_t *off = (off_t *)data;
+ uint64_t noff;
+ boolean_t hole;
+
+ hole = (cmd == FIOSEEKHOLE);
+ noff = *off;
+ error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+ *off = noff;
+ break;
+ }
+ default:
+ error = ENOIOCTL;
+ }
+
+ return (error);
+}
+
+/*
+ * Misc. helpers
+ */
+
+static void
+zvol_ensure_zilog(zvol_state_t *zv)
+{
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+}
+
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+ return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+ struct g_geom *gp;
+
+ g_topology_lock();
+ gp = pp->geom;
+ ASSERT(gp != NULL);
+
+ zsg->zsg_provider = NULL;
+ g_wither_provider(pp, ENXIO);
+
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
+ zsg->zsg_provider = pp;
+ g_error_provider(pp, 0);
+ g_topology_unlock();
+ } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ dev = zsd->zsd_cdev;
+ if (dev != NULL) {
+ destroy_dev(dev);
+ dev = zsd->zsd_cdev = NULL;
+ if (zv->zv_open_count > 0) {
+ zv->zv_flags &= ~ZVOL_EXCL;
+ zv->zv_open_count = 0;
+ /* XXX need suspend lock but lock order */
+ zvol_last_close(zv);
+ }
+ }
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
+ == 0) {
+ dev->si_iosize_max = MAXPHYS;
+ zsd->zsd_cdev = dev;
+ }
+ }
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count == 0);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+
+ g_topology_lock();
+ zvol_geom_destroy(zv);
+ g_topology_unlock();
+ mtx_destroy(&zsg->zsg_queue_mtx);
+ } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ if (dev != NULL)
+ destroy_dev(dev);
+ }
+
+ mutex_destroy(&zv->zv_state_lock);
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ zvol_minors--;
+}
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor_impl(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t volmode, hash;
+ int error;
+
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
+
+ hash = zvol_name_hash(name);
+ if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ DROP_GIANT();
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
+ if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+ /*
+ * zvol_alloc equivalent ...
+ */
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_hash = hash;
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_zso->zso_volmode = volmode;
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ zsg->zsg_state = ZVOL_GEOM_UNINIT;
+ mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
+
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_bio_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ /* TODO: NULL check? */
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zsg->zsg_provider = pp;
+ bioq_init(&zsg->zsg_queue);
+ } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+ if (error != 0) {
+ mutex_destroy(&zv->zv_state_lock);
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (*zv));
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ goto out_giant;
+ }
+ dev->si_iosize_max = MAXPHYS;
+ zsd->zsd_cdev = dev;
+ }
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+
+ /* XXX do prefetch */
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ if (error == 0)
+ zvol_geom_run(zv);
+ g_topology_unlock();
+ }
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ zvol_minors++;
+ rw_exit(&zvol_state_lock);
+ }
+ ZFS_LOG(1, "ZVOL %s created.", name);
+out_giant:
+ PICKUP_GIANT();
+ return (error);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ if (pp == NULL) /* XXX when? */
+ return;
+
+ mtx_lock(&zsg->zsg_queue_mtx);
+ zsg->zsg_state = ZVOL_GEOM_STOPPED;
+ pp->private = NULL;
+ wakeup_one(&zsg->zsg_queue);
+ while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
+ msleep(&zsg->zsg_state,
+ &zsg->zsg_queue_mtx,
+ 0, "zvol:w", 0);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ }
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ zv->zv_volsize = volsize;
+ if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ if (pp == NULL) /* XXX when? */
+ return (0);
+
+ g_topology_lock();
+
+ /*
+ * Do not invoke resize event when initial size was zero.
+ * ZVOL initializes the size on first open, this is not
+ * real resizing.
+ */
+ if (pp->mediasize == 0)
+ pp->mediasize = zv->zv_volsize;
+ else
+ g_resize_provider(pp, zv->zv_volsize);
+
+ g_topology_unlock();
+ }
+ return (0);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+ // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+ // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_freebsd_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_create_minor_impl,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+/*
+ * Public interfaces
+ */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+int
+zvol_init(void)
+{
+ zvol_init_impl();
+ zvol_register_ops(&zvol_freebsd_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_fini_impl();
+}