diff options
author | Matthew Macy <[email protected]> | 2020-04-14 11:36:28 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2020-04-14 11:36:28 -0700 |
commit | 9f0a21e6411aa0bac23fba0ddb220342a48c7cc7 (patch) | |
tree | 63e77f57396f27dbe5c69dc11e7aeb37f9008bfc /module/os | |
parent | 75c62019f3938e7bc81becb4fb2d5b5eb523e79a (diff) |
Add FreeBSD support to OpenZFS
Add the FreeBSD platform code to the OpenZFS repository. As of this
commit the source can be compiled and tested on FreeBSD 11 and 12.
Subsequent commits are now required to compile on FreeBSD and Linux.
Additionally, they must pass the ZFS Test Suite on FreeBSD which is
being run by the CI. As of this commit 1230 tests pass on FreeBSD
and there are no unexpected failures.
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Co-authored-by: Ryan Moeller <[email protected]>
Signed-off-by: Matt Macy <[email protected]>
Signed-off-by: Ryan Moeller <[email protected]>
Closes #898
Closes #8987
Diffstat (limited to 'module/os')
53 files changed, 33050 insertions, 0 deletions
diff --git a/module/os/freebsd/spl/acl_common.c b/module/os/freebsd/spl/acl_common.c new file mode 100644 index 000000000..8eea4695e --- /dev/null +++ b/module/os/freebsd/spl/acl_common.c @@ -0,0 +1,1731 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/avl.h> +#include <sys/misc.h> +#if defined(_KERNEL) +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <acl/acl_common.h> +#include <sys/debug.h> +#else +#include <errno.h> +#include <stdlib.h> +#include <stddef.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <grp.h> +#include <pwd.h> +#include <acl_common.h> +#define ASSERT assert +#endif + +#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ + ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ + ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) + + +#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 +#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 +#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 +#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 + +#define ACL_WRITE_OWNER_SET_DENY 0x0000010 +#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 +#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 +#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 + +#define ACL_DELETE_SET_DENY 0x0000100 +#define ACL_DELETE_SET_ALLOW 0x0000200 +#define ACL_DELETE_ERR_DENY 0x0000400 +#define ACL_DELETE_ERR_ALLOW 0x0000800 + +#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 +#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 +#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 +#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 + +#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 +#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 +#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 +#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 + +#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 +#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 +#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 +#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 + +#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 +#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 +#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 +#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 + + +#define ACE_VALID_MASK_BITS (\ + ACE_READ_DATA | \ + ACE_LIST_DIRECTORY | \ + ACE_WRITE_DATA | \ + ACE_ADD_FILE | \ + ACE_APPEND_DATA | \ + ACE_ADD_SUBDIRECTORY | \ + ACE_READ_NAMED_ATTRS | \ + ACE_WRITE_NAMED_ATTRS | \ + ACE_EXECUTE | \ + ACE_DELETE_CHILD | \ + ACE_READ_ATTRIBUTES | \ + ACE_WRITE_ATTRIBUTES | \ + ACE_DELETE | \ + ACE_READ_ACL | \ + ACE_WRITE_ACL | \ + ACE_WRITE_OWNER | \ + ACE_SYNCHRONIZE) + +#define ACE_MASK_UNDEFINED 0x80000000 + +#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ + ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ + ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) + +/* + * ACL conversion helpers + */ + +typedef enum { + ace_unused, + ace_user_obj, + ace_user, + ace_group, /* includes GROUP and GROUP_OBJ */ + ace_other_obj +} ace_to_aent_state_t; + +typedef struct acevals { + uid_t key; + avl_node_t avl; + uint32_t mask; + uint32_t allowed; + uint32_t denied; + int aent_type; +} acevals_t; + +typedef struct ace_list { + acevals_t user_obj; + avl_tree_t user; + int numusers; + acevals_t group_obj; + avl_tree_t group; + int numgroups; + acevals_t other_obj; + uint32_t acl_mask; + int hasmask; + int dfacl_flag; + ace_to_aent_state_t state; + int seen; /* bitmask of all aclent_t a_type values seen */ +} ace_list_t; + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. + * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)()) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); + ASSERT(s > 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (void *)(v + j * s); + p2 = (void *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} + +static int +cacl_malloc(void **ptr, size_t size) +{ + *ptr = kmem_zalloc(size, KM_SLEEP); + return (0); +} + + +#if !defined(_KERNEL) +acl_t * +acl_alloc(enum acl_type type) +{ + acl_t *aclp; + + if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) + return (NULL); + + aclp->acl_aclp = NULL; + aclp->acl_cnt = 0; + + switch (type) { + case ACE_T: + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + break; + case ACLENT_T: + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + break; + default: + acl_free(aclp); + aclp = NULL; + } + return (aclp); +} + +/* + * Free acl_t structure + */ +void +acl_free(acl_t *aclp) +{ + int acl_size; + + if (aclp == NULL) + return; + + if (aclp->acl_aclp) { + acl_size = aclp->acl_cnt * aclp->acl_entry_size; + cacl_free(aclp->acl_aclp, acl_size); + } + + cacl_free(aclp, sizeof (acl_t)); +} + +static uint32_t +access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) +{ + uint32_t access_mask = 0; + int acl_produce; + int synchronize_set = 0, write_owner_set = 0; + int delete_set = 0, write_attrs_set = 0; + int read_named_set = 0, write_named_set = 0; + + acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_WRITER_SET_DENY); + + if (isallow) { + synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; + write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; + delete_set = ACL_DELETE_SET_ALLOW; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + } else { + + synchronize_set = ACL_SYNCHRONIZE_SET_DENY; + write_owner_set = ACL_WRITE_OWNER_SET_DENY; + delete_set = ACL_DELETE_SET_DENY; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_DENY; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; + else + /* + * If the entity is not the owner and does not + * have write permissions ACE_WRITE_ATTRIBUTES will + * always go in the DENY ACE. + */ + access_mask |= ACE_WRITE_ATTRIBUTES; + } + + if (acl_produce & synchronize_set) + access_mask |= ACE_SYNCHRONIZE; + if (acl_produce & write_owner_set) + access_mask |= ACE_WRITE_OWNER; + if (acl_produce & delete_set) + access_mask |= ACE_DELETE; + if (acl_produce & write_attrs_set) + access_mask |= ACE_WRITE_ATTRIBUTES; + if (acl_produce & read_named_set) + access_mask |= ACE_READ_NAMED_ATTRS; + if (acl_produce & write_named_set) + access_mask |= ACE_WRITE_NAMED_ATTRS; + + return (access_mask); +} + +/* + * Given an mode_t, convert it into an access_mask as used + * by nfsace, assuming aclent_t -> nfsace semantics. + */ +static uint32_t +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) +{ + uint32_t access = 0; + int haswriteperm = 0; + int hasreadperm = 0; + + if (isallow) { + haswriteperm = (mode & S_IWOTH); + hasreadperm = (mode & S_IROTH); + } else { + haswriteperm = !(mode & S_IWOTH); + hasreadperm = !(mode & S_IROTH); + } + + /* + * The following call takes care of correctly setting the following + * mask bits in the access_mask: + * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, + * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS + */ + access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); + + if (isallow) { + access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; + if (isowner) + access |= ACE_WRITE_ACL; + } else { + if (! isowner) + access |= ACE_WRITE_ACL; + } + + /* read */ + if (mode & S_IROTH) { + access |= ACE_READ_DATA; + } + /* write */ + if (mode & S_IWOTH) { + access |= ACE_WRITE_DATA | + ACE_APPEND_DATA; + if (isdir) + access |= ACE_DELETE_CHILD; + } + /* exec */ + if (mode & S_IXOTH) { + access |= ACE_EXECUTE; + } + + return (access); +} + +/* + * Given an nfsace (presumably an ALLOW entry), make a + * corresponding DENY entry at the address given. + */ +static void +ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) +{ + (void) memcpy(deny, allow, sizeof (ace_t)); + + deny->a_who = allow->a_who; + + deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; + if (isdir) + deny->a_access_mask ^= ACE_DELETE_CHILD; + + deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | + ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_WRITE_NAMED_ATTRS); + deny->a_access_mask |= access_mask_set((allow->a_access_mask & + ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, + B_FALSE); +} +/* + * Make an initial pass over an array of aclent_t's. Gather + * information such as an ACL_MASK (if any), number of users, + * number of groups, and whether the array needs to be sorted. + */ +static int +ln_aent_preprocess(aclent_t *aclent, int n, + int *hasmask, mode_t *mask, + int *numuser, int *numgroup, int *needsort) +{ + int error = 0; + int i; + int curtype = 0; + + *hasmask = 0; + *mask = 07; + *needsort = 0; + *numuser = 0; + *numgroup = 0; + + for (i = 0; i < n; i++) { + if (aclent[i].a_type < curtype) + *needsort = 1; + else if (aclent[i].a_type > curtype) + curtype = aclent[i].a_type; + if (aclent[i].a_type & USER) + (*numuser)++; + if (aclent[i].a_type & (GROUP | GROUP_OBJ)) + (*numgroup)++; + if (aclent[i].a_type & CLASS_OBJ) { + if (*hasmask) { + error = EINVAL; + goto out; + } else { + *hasmask = 1; + *mask = aclent[i].a_perm; + } + } + } + + if ((! *hasmask) && (*numuser + *numgroup > 1)) { + error = EINVAL; + goto out; + } + +out: + return (error); +} + +/* + * Convert an array of aclent_t into an array of nfsace entries, + * following POSIX draft -> nfsv4 conversion semantics as outlined in + * the IETF draft. + */ +static int +ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) +{ + int error = 0; + mode_t mask; + int numuser, numgroup, needsort; + int resultsize = 0; + int i, groupi = 0, skip; + ace_t *acep, *result = NULL; + int hasmask; + + error = ln_aent_preprocess(aclent, n, &hasmask, &mask, + &numuser, &numgroup, &needsort); + if (error != 0) + goto out; + + /* allow + deny for each aclent */ + resultsize = n * 2; + if (hasmask) { + /* + * stick extra deny on the group_obj and on each + * user|group for the mask (the group_obj was added + * into the count for numgroup) + */ + resultsize += numuser + numgroup; + /* ... and don't count the mask itself */ + resultsize -= 2; + } + + /* sort the source if necessary */ + if (needsort) + ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); + + if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) + goto out; + + acep = result; + + for (i = 0; i < n; i++) { + /* + * don't process CLASS_OBJ (mask); mask was grabbed in + * ln_aent_preprocess() + */ + if (aclent[i].a_type & CLASS_OBJ) + continue; + + /* If we need an ACL_MASK emulator, prepend it now */ + if ((hasmask) && + (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { + acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + acep->a_flags = 0; + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= + (ACE_IDENTIFIER_GROUP|ACE_GROUP); + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + } else { + acep->a_who = aclent[i].a_id; + acep->a_flags |= ACE_IDENTIFIER_GROUP; + } + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + /* + * Set the access mask for the prepended deny + * ace. To do this, we invert the mask (found + * in ln_aent_preprocess()) then convert it to an + * DENY ace access_mask. + */ + acep->a_access_mask = mode_to_ace_access((mask ^ 07), + isdir, 0, 0); + acep += 1; + } + + /* handle a_perm -> access_mask */ + acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, + isdir, aclent[i].a_type & USER_OBJ, 1); + + /* emulate a default aclent */ + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + + /* + * handle a_perm and a_id + * + * this must be done last, since it involves the + * corresponding deny aces, which are handled + * differently for each different a_type. + */ + if (aclent[i].a_type & USER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_OWNER; + ace_make_deny(acep, acep + 1, isdir, B_TRUE); + acep += 2; + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_GROUP; + } else { + acep->a_who = aclent[i].a_id; + } + acep->a_flags |= ACE_IDENTIFIER_GROUP; + /* + * Set the corresponding deny for the group ace. + * + * The deny aces go after all of the groups, unlike + * everything else, where they immediately follow + * the allow ace. + * + * We calculate "skip", the number of slots to + * skip ahead for the deny ace, here. + * + * The pattern is: + * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 + * thus, skip is + * (2 * numgroup) - 1 - groupi + * (2 * numgroup) to account for MD + A + * - 1 to account for the fact that we're on the + * access (A), not the mask (MD) + * - groupi to account for the fact that we have + * passed up groupi number of MD's. + */ + skip = (2 * numgroup) - 1 - groupi; + ace_make_deny(acep, acep + skip, isdir, B_FALSE); + /* + * If we just did the last group, skip acep past + * all of the denies; else, just move ahead one. + */ + if (++groupi >= numgroup) + acep += numgroup + 1; + else + acep += 1; + } else if (aclent[i].a_type & OTHER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_EVERYONE; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else { + error = EINVAL; + goto out; + } + } + + *acepp = result; + *rescount = resultsize; + +out: + if (error != 0) { + if ((result != NULL) && (resultsize > 0)) { + cacl_free(result, resultsize * sizeof (ace_t)); + } + } + + return (error); +} + +static int +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, + ace_t **retacep, int *retacecnt) +{ + ace_t *acep; + ace_t *dfacep; + int acecnt = 0; + int dfacecnt = 0; + int dfaclstart = 0; + int dfaclcnt = 0; + aclent_t *aclp; + int i; + int error; + int acesz, dfacesz; + + ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); + + for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + if (i < aclcnt) { + dfaclstart = i; + dfaclcnt = aclcnt - i; + } + + if (dfaclcnt && !isdir) { + return (EINVAL); + } + + error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); + if (error) + return (error); + + if (dfaclcnt) { + error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, + &dfacep, &dfacecnt, isdir); + if (error) { + if (acep) { + cacl_free(acep, acecnt * sizeof (ace_t)); + } + return (error); + } + } + + if (dfacecnt != 0) { + acesz = sizeof (ace_t) * acecnt; + dfacesz = sizeof (ace_t) * dfacecnt; + acep = cacl_realloc(acep, acesz, acesz + dfacesz); + if (acep == NULL) + return (ENOMEM); + if (dfaclcnt) { + (void) memcpy(acep + acecnt, dfacep, dfacesz); + } + } + if (dfaclcnt) + cacl_free(dfacep, dfacecnt * sizeof (ace_t)); + + *retacecnt = acecnt + dfacecnt; + *retacep = acep; + return (0); +} + +static int +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + int error = 0; + o_mode_t mode = 0; + uint32_t bits, wantbits; + + /* read */ + if (mask & ACE_READ_DATA) + mode |= S_IROTH; + + /* write */ + wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); + if (isdir) + wantbits |= ACE_DELETE_CHILD; + bits = mask & wantbits; + if (bits != 0) { + if (bits != wantbits) { + error = ENOTSUP; + goto out; + } + mode |= S_IWOTH; + } + + /* exec */ + if (mask & ACE_EXECUTE) { + mode |= S_IXOTH; + } + + *modep = mode; + +out: + return (error); +} + +static void +acevals_init(acevals_t *vals, uid_t key) +{ + bzero(vals, sizeof (*vals)); + vals->allowed = ACE_MASK_UNDEFINED; + vals->denied = ACE_MASK_UNDEFINED; + vals->mask = ACE_MASK_UNDEFINED; + vals->key = key; +} + +static void +ace_list_init(ace_list_t *al, int dfacl_flag) +{ + acevals_init(&al->user_obj, 0); + acevals_init(&al->group_obj, 0); + acevals_init(&al->other_obj, 0); + al->numusers = 0; + al->numgroups = 0; + al->acl_mask = 0; + al->hasmask = 0; + al->state = ace_unused; + al->seen = 0; + al->dfacl_flag = dfacl_flag; +} + +/* + * Find or create an acevals holder for a given id and avl tree. + * + * Note that only one thread will ever touch these avl trees, so + * there is no need for locking. + */ +static acevals_t * +acevals_find(ace_t *ace, avl_tree_t *avl, int *num) +{ + acevals_t key, *rc; + avl_index_t where; + + key.key = ace->a_who; + rc = avl_find(avl, &key, &where); + if (rc != NULL) + return (rc); + + /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ + if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) + return (NULL); + + acevals_init(rc, ace->a_who); + avl_insert(avl, rc, where); + (*num)++; + + return (rc); +} + +static int +access_mask_check(ace_t *acep, int mask_bit, int isowner) +{ + int set_deny, err_deny; + int set_allow, err_allow; + int acl_consume; + int haswriteperm, hasreadperm; + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; + } else { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; + } + + acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | + ACL_DELETE_ERR_DENY | + ACL_WRITE_OWNER_ERR_DENY | + ACL_WRITE_OWNER_ERR_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_ERR_DENY | + ACL_WRITE_ATTRS_WRITER_SET_DENY | + ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | + ACL_WRITE_NAMED_WRITER_ERR_DENY | + ACL_READ_NAMED_READER_ERR_DENY); + + if (mask_bit == ACE_SYNCHRONIZE) { + set_deny = ACL_SYNCHRONIZE_SET_DENY; + err_deny = ACL_SYNCHRONIZE_ERR_DENY; + set_allow = ACL_SYNCHRONIZE_SET_ALLOW; + err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_OWNER) { + set_deny = ACL_WRITE_OWNER_SET_DENY; + err_deny = ACL_WRITE_OWNER_ERR_DENY; + set_allow = ACL_WRITE_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_OWNER_ERR_ALLOW; + } else if (mask_bit == ACE_DELETE) { + set_deny = ACL_DELETE_SET_DENY; + err_deny = ACL_DELETE_ERR_DENY; + set_allow = ACL_DELETE_SET_ALLOW; + err_allow = ACL_DELETE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { + if (isowner) { + set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; + } else if (haswriteperm) { + set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; + } else { + if ((acep->a_access_mask & mask_bit) && + (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { + return (ENOTSUP); + } + return (0); + } + } else if (mask_bit == ACE_READ_NAMED_ATTRS) { + if (!hasreadperm) + return (0); + + set_deny = ACL_READ_NAMED_READER_SET_DENY; + err_deny = ACL_READ_NAMED_READER_ERR_DENY; + set_allow = ACL_READ_NAMED_READER_SET_ALLOW; + err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { + if (!haswriteperm) + return (0); + + set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY; + err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; + set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; + } else { + return (EINVAL); + } + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + if (acl_consume & set_deny) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_deny) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } else { + /* ACE_ACCESS_ALLOWED_ACE_TYPE */ + if (acl_consume & set_allow) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_allow) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } + return (0); +} + +static int +ace_to_aent_legal(ace_t *acep) +{ + int error = 0; + int isowner; + + /* only ALLOW or DENY */ + if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && + (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid flags */ + if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { + error = EINVAL; + goto out; + } + + /* some flags are illegal */ + if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | + ACE_FAILED_ACCESS_ACE_FLAG | + ACE_NO_PROPAGATE_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid masks */ + if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { + error = EINVAL; + goto out; + } + + if ((acep->a_flags & ACE_OWNER)) { + isowner = 1; + } else { + isowner = 0; + } + + error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_DELETE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner); + if (error) + goto out; + + /* more detailed checking of masks */ + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (! (acep->a_access_mask & ACE_APPEND_DATA))) { + error = ENOTSUP; + goto out; + } + if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && + (acep->a_access_mask & ACE_APPEND_DATA)) { + error = ENOTSUP; + goto out; + } + } + + /* ACL enforcement */ + if ((acep->a_access_mask & ACE_READ_ACL) && + (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + if (acep->a_access_mask & ACE_WRITE_ACL) { + if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && + (isowner)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && + (! isowner)) { + error = ENOTSUP; + goto out; + } + } + +out: + return (error); +} + +static int +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ + if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != + (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { + return (ENOTSUP); + } + + return (ace_mask_to_mode(mask, modep, isdir)); +} + +static int +acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error; + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + + if (isdir) + flips |= ACE_DELETE_CHILD; + if (vals->allowed != (vals->denied ^ flips)) { + error = ENOTSUP; + goto out; + } + if ((list->hasmask) && (list->acl_mask != vals->mask) && + (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { + error = ENOTSUP; + goto out; + } + error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); + if (error != 0) + goto out; + dest->a_type = vals->aent_type; + if (dest->a_type & (USER | GROUP)) { + dest->a_id = vals->key; + } else if (dest->a_type & USER_OBJ) { + dest->a_id = owner; + } else if (dest->a_type & GROUP_OBJ) { + dest->a_id = group; + } else if (dest->a_type & OTHER_OBJ) { + dest->a_id = 0; + } else { + error = EINVAL; + goto out; + } + +out: + return (error); +} + + +static int +ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error = 0; + aclent_t *aent, *result = NULL; + acevals_t *vals; + int resultcount; + + if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != + (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { + error = ENOTSUP; + goto out; + } + if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { + error = ENOTSUP; + goto out; + } + + resultcount = 3 + list->numusers + list->numgroups; + /* + * This must be the same condition as below, when we add the CLASS_OBJ + * (aka ACL mask) + */ + if ((list->hasmask) || (! list->dfacl_flag)) + resultcount += 1; + + if (cacl_malloc((void **)&result, + resultcount * sizeof (aclent_t)) != 0) { + error = ENOMEM; + goto out; + } + aent = result; + + /* USER_OBJ */ + if (!(list->user_obj.aent_type & USER_OBJ)) { + error = EINVAL; + goto out; + } + + error = acevals_to_aent(&list->user_obj, aent, list, owner, group, + isdir); + + if (error != 0) + goto out; + ++aent; + /* USER */ + vals = NULL; + for (vals = avl_first(&list->user); vals != NULL; + vals = AVL_NEXT(&list->user, vals)) { + if (!(vals->aent_type & USER)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* GROUP_OBJ */ + if (!(list->group_obj.aent_type & GROUP_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->group_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + /* GROUP */ + vals = NULL; + for (vals = avl_first(&list->group); vals != NULL; + vals = AVL_NEXT(&list->group, vals)) { + if (!(vals->aent_type & GROUP)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* + * CLASS_OBJ (aka ACL_MASK) + * + * An ACL_MASK is not fabricated if the ACL is a default ACL. + * This is to follow UFS's behavior. + */ + if ((list->hasmask) || (! list->dfacl_flag)) { + if (list->hasmask) { + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + if (isdir) + flips |= ACE_DELETE_CHILD; + error = ace_mask_to_mode(list->acl_mask ^ flips, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } else { + /* fabricate the ACL_MASK from the group permissions */ + error = ace_mask_to_mode(list->group_obj.allowed, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } + aent->a_id = 0; + aent->a_type = CLASS_OBJ | list->dfacl_flag; + ++aent; + } + /* OTHER_OBJ */ + if (!(list->other_obj.aent_type & OTHER_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->other_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + + *aclentp = result; + *aclcnt = resultcount; + +out: + if (error != 0) { + if (result != NULL) + cacl_free(result, resultcount * sizeof (aclent_t)); + } + + return (error); +} + + +/* + * free all data associated with an ace_list + */ +static void +ace_list_free(ace_list_t *al) +{ + acevals_t *node; + void *cookie; + + if (al == NULL) + return; + + cookie = NULL; + while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + cookie = NULL; + while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + + avl_destroy(&al->user); + avl_destroy(&al->group); + + /* free the container itself */ + cacl_free(al, sizeof (ace_list_t)); +} + +static int +acevals_compare(const void *va, const void *vb) +{ + const acevals_t *a = va, *b = vb; + + if (a->key == b->key) + return (0); + + if (a->key > b->key) + return (1); + + else + return (-1); +} + +/* + * Convert a list of ace_t entries to equivalent regular and default + * aclent_t lists. Return error (ENOTSUP) when conversion is not possible. + */ +static int +ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, + aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, + boolean_t isdir) +{ + int error = 0; + ace_t *acep; + uint32_t bits; + int i; + ace_list_t *normacl = NULL, *dfacl = NULL, *acl; + acevals_t *vals; + + *aclentp = NULL; + *aclcnt = 0; + *dfaclentp = NULL; + *dfaclcnt = 0; + + /* we need at least user_obj, group_obj, and other_obj */ + if (n < 6) { + error = ENOTSUP; + goto out; + } + if (ace == NULL) { + error = EINVAL; + goto out; + } + + error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + + ace_list_init(normacl, 0); + + error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + ace_list_init(dfacl, ACL_DEFAULT); + + /* process every ace_t... */ + for (i = 0; i < n; i++) { + acep = &ace[i]; + + /* rule out certain cases quickly */ + error = ace_to_aent_legal(acep); + if (error != 0) + goto out; + + /* + * Turn off these bits in order to not have to worry about + * them when doing the checks for compliments. + */ + acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | + ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); + + /* see if this should be a regular or default acl */ + bits = acep->a_flags & + (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE); + if (bits != 0) { + /* all or nothing on these inherit bits */ + if (bits != (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + acl = dfacl; + } else { + acl = normacl; + } + + if ((acep->a_flags & ACE_OWNER)) { + if (acl->state > ace_user_obj) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user_obj; + acl->seen |= USER_OBJ; + vals = &acl->user_obj; + vals->aent_type = USER_OBJ | acl->dfacl_flag; + } else if ((acep->a_flags & ACE_EVERYONE)) { + acl->state = ace_other_obj; + acl->seen |= OTHER_OBJ; + vals = &acl->other_obj; + vals->aent_type = OTHER_OBJ | acl->dfacl_flag; + } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { + if (acl->state > ace_group) { + error = ENOTSUP; + goto out; + } + if ((acep->a_flags & ACE_GROUP)) { + acl->seen |= GROUP_OBJ; + vals = &acl->group_obj; + vals->aent_type = GROUP_OBJ | acl->dfacl_flag; + } else { + acl->seen |= GROUP; + vals = acevals_find(acep, &acl->group, + &acl->numgroups); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = GROUP | acl->dfacl_flag; + } + acl->state = ace_group; + } else { + if (acl->state > ace_user) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user; + acl->seen |= USER; + vals = acevals_find(acep, &acl->user, + &acl->numusers); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = USER | acl->dfacl_flag; + } + + if (!(acl->state > ace_unused)) { + error = EINVAL; + goto out; + } + + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + /* no more than one allowed per aclent_t */ + if (vals->allowed != ACE_MASK_UNDEFINED) { + error = ENOTSUP; + goto out; + } + vals->allowed = acep->a_access_mask; + } else { + /* + * it's a DENY; if there was a previous DENY, it + * must have been an ACL_MASK. + */ + if (vals->denied != ACE_MASK_UNDEFINED) { + /* ACL_MASK is for USER and GROUP only */ + if ((acl->state != ace_user) && + (acl->state != ace_group)) { + error = ENOTSUP; + goto out; + } + + if (! acl->hasmask) { + acl->hasmask = 1; + acl->acl_mask = vals->denied; + /* check for mismatched ACL_MASK emulations */ + } else if (acl->acl_mask != vals->denied) { + error = ENOTSUP; + goto out; + } + vals->mask = vals->denied; + } + vals->denied = acep->a_access_mask; + } + } + + /* done collating; produce the aclent_t lists */ + if (normacl->state != ace_unused) { + error = ace_list_to_aent(normacl, aclentp, aclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + if (dfacl->state != ace_unused) { + error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + +out: + if (normacl != NULL) + ace_list_free(normacl); + if (dfacl != NULL) + ace_list_free(dfacl); + + return (error); +} + +static int +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, + uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) +{ + int error = 0; + aclent_t *aclentp, *dfaclentp; + int aclcnt, dfaclcnt; + int aclsz, dfaclsz; + + error = ln_ace_to_aent(acebufp, acecnt, owner, group, + &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); + + if (error) + return (error); + + + if (dfaclcnt != 0) { + /* + * Slap aclentp and dfaclentp into a single array. + */ + aclsz = sizeof (aclent_t) * aclcnt; + dfaclsz = sizeof (aclent_t) * dfaclcnt; + aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); + if (aclentp != NULL) { + (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); + } else { + error = ENOMEM; + } + } + + if (aclentp) { + *retaclentp = aclentp; + *retaclcnt = aclcnt + dfaclcnt; + } + + if (dfaclentp) + cacl_free(dfaclentp, dfaclsz); + + return (error); +} + + +int +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, + gid_t group) +{ + int aclcnt; + void *acldata; + int error; + + /* + * See if we need to translate + */ + if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || + (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACLENT_T)) + return (0); + + if (target_flavor == -1) { + error = EINVAL; + goto out; + } + + if (target_flavor == _ACL_ACE_ENABLED && + aclp->acl_type == ACLENT_T) { + error = convert_aent_to_ace(aclp->acl_aclp, + aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); + if (error) + goto out; + + } else if (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACE_T) { + error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, + isdir, owner, group, (aclent_t **)&acldata, &aclcnt); + if (error) + goto out; + } else { + error = ENOTSUP; + goto out; + } + + /* + * replace old acl with newly translated acl + */ + cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); + aclp->acl_aclp = acldata; + aclp->acl_cnt = aclcnt; + if (target_flavor == _ACL_ACE_ENABLED) { + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + } else { + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + } + return (0); + +out: + +#if !defined(_KERNEL) + errno = error; + return (-1); +#else + return (error); +#endif +} +#endif /* !_KERNEL */ + +#define SET_ACE(acl, index, who, mask, type, flags) { \ + acl[0][index].a_who = (uint32_t)who; \ + acl[0][index].a_type = type; \ + acl[0][index].a_flags = flags; \ + acl[0][index++].a_access_mask = mask; \ +} + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + (void) isdir; /* will need this later */ + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + +int +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) +{ + int index = 0; + int error; + trivial_acl_t masks; + + *count = 3; + acl_trivial_access_masks(mode, isdir, &masks); + + if (masks.allow0) + (*count)++; + if (masks.deny1) + (*count)++; + if (masks.deny2) + (*count)++; + + if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) + return (error); + + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); + } + + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_IDENTIFIER_GROUP|ACE_GROUP); + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_EVERYONE); + + return (0); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + return (0); +} + +uint64_t +ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, + uint16_t *type, uint32_t *mask) +{ + ace_t *acep = datap; + + if (cookie >= aclcnt) + return (0); + + *flags = acep[cookie].a_flags; + *type = acep[cookie].a_type; + *mask = acep[cookie++].a_access_mask; + + return (cookie); +} + +int +ace_trivial(ace_t *acep, int aclcnt) +{ + return (ace_trivial_common(acep, aclcnt, ace_walk)); +} diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c new file mode 100644 index 000000000..d4a0e141c --- /dev/null +++ b/module/os/freebsd/spl/callb.c @@ -0,0 +1,372 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/callb.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/systm.h> /* for delay() */ +#include <sys/taskq.h> /* For TASKQ_NAMELEN */ +#include <sys/kernel.h> + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(); /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. + * The callbs within a class are single-linked and normally run by a + * serial execution. + */ +typedef struct callb_table { + kmutex_t ct_lock; /* protect all callb states */ + callb_t *ct_freelist; /* free callb structures */ + int ct_busy; /* != 0 prevents additions */ + kcondvar_t ct_busy_cv; /* to wait for not busy */ + int ct_ncallb; /* num of callbs allocated */ + callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ +} callb_table_t; + +int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; + +static callb_id_t callb_add_common(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); + +static callb_table_t callb_table; /* system level callback table */ +static callb_table_t *ct = &callb_table; +static kmutex_t callb_safe_mutex; +callb_cpr_t callb_cprinfo_safe = { + &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} }; + +/* + * Init all callb tables in the system. + */ +void +callb_init(void *dummy __unused) +{ + callb_table.ct_busy = 0; /* mark table open for additions */ + mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +callb_fini(void *dummy __unused) +{ + callb_t *cp; + int i; + + mutex_enter(&ct->ct_lock); + for (i = 0; i < 16; i++) { + while ((cp = ct->ct_freelist) != NULL) { + ct->ct_freelist = cp->c_next; + ct->ct_ncallb--; + kmem_free(cp, sizeof (callb_t)); + } + if (ct->ct_ncallb == 0) + break; + /* Not all callbacks finished, waiting for the rest. */ + mutex_exit(&ct->ct_lock); + tsleep(ct, 0, "callb", hz / 4); + mutex_enter(&ct->ct_lock); + } + if (ct->ct_ncallb > 0) + printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); + mutex_exit(&ct->ct_lock); + mutex_destroy(&callb_safe_mutex); + mutex_destroy(&callb_table.ct_lock); +} + +/* + * callout_add() is called to register func() be called later. + */ +static callb_id_t +callb_add_common(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + callb_t *cp; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + while (ct->ct_busy) + cv_wait(&ct->ct_busy_cv, &ct->ct_lock); + if ((cp = ct->ct_freelist) == NULL) { + ct->ct_ncallb++; + cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); + } + ct->ct_freelist = cp->c_next; + cp->c_thread = t; + cp->c_func = func; + cp->c_arg = arg; + cp->c_class = (uchar_t)class; + cp->c_flag |= CALLB_TAKEN; +#ifdef DEBUG + if (strlen(name) > CB_MAXNAME) + cmn_err(CE_WARN, "callb_add: name of callback function '%s' " + "too long -- truncated to %d chars", + name, CB_MAXNAME); +#endif + (void) strncpy(cp->c_name, name, CB_MAXNAME); + cp->c_name[CB_MAXNAME] = '\0'; + + /* + * Insert the new callb at the head of its class list. + */ + cp->c_next = ct->ct_first_cb[class]; + ct->ct_first_cb[class] = cp; + + mutex_exit(&ct->ct_lock); + return ((callb_id_t)cp); +} + +/* + * The default function to add an entry to the callback table. Since + * it uses curthread as the thread identifier to store in the table, + * it should be used for the normal case of a thread which is calling + * to add ITSELF to the table. + */ +callb_id_t +callb_add(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name) +{ + return (callb_add_common(func, arg, class, name, curthread)); +} + +/* + * A special version of callb_add() above for use by threads which + * might be adding an entry to the table on behalf of some other + * thread (for example, one which is constructed but not yet running). + * In this version the thread id is an argument. + */ +callb_id_t +callb_add_thread(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + return (callb_add_common(func, arg, class, name, t)); +} + +/* + * callout_delete() is called to remove an entry identified by id + * that was originally placed there by a call to callout_add(). + * return -1 if fail to delete a callb entry otherwise return 0. + */ +int +callb_delete(callb_id_t id) +{ + callb_t **pp; + callb_t *me = (callb_t *)id; + + mutex_enter(&ct->ct_lock); + + for (;;) { + pp = &ct->ct_first_cb[me->c_class]; + while (*pp != NULL && *pp != me) + pp = &(*pp)->c_next; + +#ifdef DEBUG + if (*pp != me) { + cmn_err(CE_WARN, "callb delete bogus entry 0x%p", + (void *)me); + mutex_exit(&ct->ct_lock); + return (-1); + } +#endif /* DEBUG */ + + /* + * It is not allowed to delete a callb in the middle of + * executing otherwise, the callb_execute() will be confused. + */ + if (!(me->c_flag & CALLB_EXECUTING)) + break; + + cv_wait(&me->c_done_cv, &ct->ct_lock); + } + /* relink the class list */ + *pp = me->c_next; + + /* clean up myself and return the free callb to the head of freelist */ + me->c_flag = CALLB_FREE; + me->c_next = ct->ct_freelist; + ct->ct_freelist = me; + + mutex_exit(&ct->ct_lock); + return (0); +} + +/* + * class: indicates to execute all callbs in the same class; + * code: optional argument for the callb functions. + * return: = 0: success + * != 0: ptr to string supplied when callback was registered + */ +void * +callb_execute_class(int class, int code) +{ + callb_t *cp; + void *ret = NULL; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + + for (cp = ct->ct_first_cb[class]; + cp != NULL && ret == 0; cp = cp->c_next) { + while (cp->c_flag & CALLB_EXECUTING) + cv_wait(&cp->c_done_cv, &ct->ct_lock); + /* + * cont if the callb is deleted while we're sleeping + */ + if (cp->c_flag == CALLB_FREE) + continue; + cp->c_flag |= CALLB_EXECUTING; + +#ifdef CALLB_DEBUG + printf("callb_execute: name=%s func=%p arg=%p\n", + cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); +#endif /* CALLB_DEBUG */ + + mutex_exit(&ct->ct_lock); + /* If callback function fails, pass back client's name */ + if (!(*cp->c_func)(cp->c_arg, code)) + ret = cp->c_name; + mutex_enter(&ct->ct_lock); + + cp->c_flag &= ~CALLB_EXECUTING; + cv_broadcast(&cp->c_done_cv); + } + mutex_exit(&ct->ct_lock); + return (ret); +} + +/* + * callers make sure no recursive entries to this func. + * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. + * + * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we + * use a cv_timedwait() in case the kernel thread is blocked. + * + * Note that this is a generic callback handler for daemon CPR and + * should NOT be changed to accommodate any specific requirement in a daemon. + * Individual daemons that require changes to the handler shall write + * callback routines in their own daemon modules. + */ +boolean_t +callb_generic_cpr(void *arg, int code) +{ + callb_cpr_t *cp = (callb_cpr_t *)arg; + clock_t ret = 0; /* assume success */ + + mutex_enter(cp->cc_lockp); + + switch (code) { + case CB_CODE_CPR_CHKPT: + cp->cc_events |= CALLB_CPR_START; +#ifdef CPR_NOT_THREAD_SAFE + while (!(cp->cc_events & CALLB_CPR_SAFE)) + /* cv_timedwait() returns -1 if it times out. */ + if ((ret = cv_reltimedwait(&cp->cc_callb_cv, + cp->cc_lockp, (callb_timeout_sec * hz), + TR_CLOCK_TICK)) == -1) + break; +#endif + break; + + case CB_CODE_CPR_RESUME: + cp->cc_events &= ~CALLB_CPR_START; + cv_signal(&cp->cc_stop_cv); + break; + } + mutex_exit(cp->cc_lockp); + return (ret != -1); +} + +/* + * The generic callback function associated with kernel threads which + * are always considered safe. + */ +/* ARGSUSED */ +boolean_t +callb_generic_cpr_safe(void *arg, int code) +{ + return (B_TRUE); +} +/* + * Prevent additions to callback table. + */ +void +callb_lock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy == 0); + ct->ct_busy = 1; + mutex_exit(&ct->ct_lock); +} + +/* + * Allow additions to callback table. + */ +void +callb_unlock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy != 0); + ct->ct_busy = 0; + cv_broadcast(&ct->ct_busy_cv); + mutex_exit(&ct->ct_lock); +} + +SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); +SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/module/os/freebsd/spl/list.c b/module/os/freebsd/spl/list.c new file mode 100644 index 000000000..e8db13a5c --- /dev/null +++ b/module/os/freebsd/spl/list.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Generic doubly-linked list implementation + */ + +#include <sys/list.h> +#include <sys/list_impl.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->list_next = lold->list_next; + lnew->list_prev = lold->list_prev; + lold->list_prev->list_next = lnew; + lold->list_next->list_prev = lnew; + lold->list_next = lold->list_prev = NULL; +} + +void +list_link_init(list_node_t *link) +{ + link->list_next = NULL; + link->list_prev = NULL; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h new file mode 100644 index 000000000..0abd43068 --- /dev/null +++ b/module/os/freebsd/spl/sha224.h @@ -0,0 +1,96 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA224_H_ +#define _SHA224_H_ + +#ifndef _KERNEL +#include <sys/types.h> +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA224_DIGEST_LENGTH 28 +#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA224Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA224_BLOCK_LENGTH]; +} SHA224_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA224_Init +#define SHA224_Init _libmd_SHA224_Init +#endif +#ifndef SHA224_Update +#define SHA224_Update _libmd_SHA224_Update +#endif +#ifndef SHA224_Final +#define SHA224_Final _libmd_SHA224_Final +#endif +#ifndef SHA224_End +#define SHA224_End _libmd_SHA224_End +#endif +#ifndef SHA224_Fd +#define SHA224_Fd _libmd_SHA224_Fd +#endif +#ifndef SHA224_FdChunk +#define SHA224_FdChunk _libmd_SHA224_FdChunk +#endif +#ifndef SHA224_File +#define SHA224_File _libmd_SHA224_File +#endif +#ifndef SHA224_FileChunk +#define SHA224_FileChunk _libmd_SHA224_FileChunk +#endif +#ifndef SHA224_Data +#define SHA224_Data _libmd_SHA224_Data +#endif + +#ifndef SHA224_version +#define SHA224_version _libmd_SHA224_version +#endif + +void SHA224_Init(SHA224_CTX *); +void SHA224_Update(SHA224_CTX *, const void *, size_t); +void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)], + SHA224_CTX *); +#ifndef _KERNEL +char *SHA224_End(SHA224_CTX *, char *); +char *SHA224_Data(const void *, unsigned int, char *); +char *SHA224_Fd(int, char *); +char *SHA224_FdChunk(int, char *, off_t, off_t); +char *SHA224_File(const char *, char *); +char *SHA224_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA224_H_ */ diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h new file mode 100644 index 000000000..193c0c025 --- /dev/null +++ b/module/os/freebsd/spl/sha256.h @@ -0,0 +1,99 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#ifndef _KERNEL +#include <sys/types.h> +#endif + +#define SHA256_BLOCK_LENGTH 64 +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA256Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA256_BLOCK_LENGTH]; +} SHA256_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA256_Init +#define SHA256_Init _libmd_SHA256_Init +#endif +#ifndef SHA256_Update +#define SHA256_Update _libmd_SHA256_Update +#endif +#ifndef SHA256_Final +#define SHA256_Final _libmd_SHA256_Final +#endif +#ifndef SHA256_End +#define SHA256_End _libmd_SHA256_End +#endif +#ifndef SHA256_Fd +#define SHA256_Fd _libmd_SHA256_Fd +#endif +#ifndef SHA256_FdChunk +#define SHA256_FdChunk _libmd_SHA256_FdChunk +#endif +#ifndef SHA256_File +#define SHA256_File _libmd_SHA256_File +#endif +#ifndef SHA256_FileChunk +#define SHA256_FileChunk _libmd_SHA256_FileChunk +#endif +#ifndef SHA256_Data +#define SHA256_Data _libmd_SHA256_Data +#endif + +#ifndef SHA256_Transform +#define SHA256_Transform _libmd_SHA256_Transform +#endif +#ifndef SHA256_version +#define SHA256_version _libmd_SHA256_version +#endif + +void SHA256_Init(SHA256_CTX *); +void SHA256_Update(SHA256_CTX *, const void *, size_t); +void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)], + SHA256_CTX *); +#ifndef _KERNEL +char *SHA256_End(SHA256_CTX *, char *); +char *SHA256_Data(const void *, unsigned int, char *); +char *SHA256_Fd(int, char *); +char *SHA256_FdChunk(int, char *, off_t, off_t); +char *SHA256_File(const char *, char *); +char *SHA256_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA256_H_ */ diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c new file mode 100644 index 000000000..241cf8c9a --- /dev/null +++ b/module/os/freebsd/spl/sha256c.c @@ -0,0 +1,378 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + + +#include <sys/byteorder.h> +#include <sys/endian.h> +#include "sha224.h" +#include "sha256.h" + +#if BYTE_ORDER == BIG_ENDIAN + +/* Copy a vector of big-endian uint32_t into a vector of bytes */ +#define be32enc_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +/* Copy a vector of bytes into a vector of big-endian uint32_t */ +#define be32dec_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +#else /* BYTE_ORDER != BIG_ENDIAN */ + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +#endif /* BYTE_ORDER != BIG_ENDIAN */ + +/* SHA256 round constants. */ +static const uint32_t K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + K[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t *state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be32dec_vect(W, block, 64); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count >> 3) & 0x3f; + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if (r < 56) { + /* Pad to 56 mod 64. */ + memcpy(&ctx->buf[r], PAD, 56 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 56); + } + + /* Add the terminating bit-count. */ + be64enc(&ctx->buf[56], ctx->count); + + /* Mix in the final block. */ + SHA256_Transform(ctx->state, ctx->buf); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. */ +void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len) +{ + uint64_t bitlen; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen = len << 3; + + /* Update number of bits */ + ctx->count += bitlen; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* SHA-224: ******************************************************* */ +/* + * the SHA224 and SHA256 transforms are identical + */ + +/* SHA-224 initialization. Begins a SHA-224 operation. */ +void +SHA224_Init(SHA224_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0xC1059ED8; + ctx->state[1] = 0x367CD507; + ctx->state[2] = 0x3070DD17; + ctx->state[3] = 0xF70E5939; + ctx->state[4] = 0xFFC00B31; + ctx->state[5] = 0x68581511; + ctx->state[6] = 0x64f98FA7; + ctx->state[7] = 0xBEFA4FA4; +} + +/* Add bytes into the SHA-224 hash */ +void +SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len) +{ + + SHA256_Update((SHA256_CTX *)ctx, in, len); +} + +/* + * SHA-224 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx) +{ + + /* Add padding */ + SHA256_Pad((SHA256_CTX *)ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +#ifdef WEAK_REFS +/* + * When building libmd, provide weak references. Note: this is not + * activated in the context of compiling these sources for internal + * use in libcrypt. + */ +#undef SHA256_Init +__weak_reference(_libmd_SHA256_Init, SHA256_Init); +#undef SHA256_Update +__weak_reference(_libmd_SHA256_Update, SHA256_Update); +#undef SHA256_Final +__weak_reference(_libmd_SHA256_Final, SHA256_Final); +#undef SHA256_Transform +__weak_reference(_libmd_SHA256_Transform, SHA256_Transform); + +#undef SHA224_Init +__weak_reference(_libmd_SHA224_Init, SHA224_Init); +#undef SHA224_Update +__weak_reference(_libmd_SHA224_Update, SHA224_Update); +#undef SHA224_Final +__weak_reference(_libmd_SHA224_Final, SHA224_Final); +#endif diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h new file mode 100644 index 000000000..67250cee0 --- /dev/null +++ b/module/os/freebsd/spl/sha384.h @@ -0,0 +1,96 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA384_H_ +#define _SHA384_H_ + +#ifndef _KERNEL +#include <sys/types.h> +#endif + +#define SHA384_BLOCK_LENGTH 128 +#define SHA384_DIGEST_LENGTH 48 +#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA384Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA384_BLOCK_LENGTH]; +} SHA384_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA384_Init +#define SHA384_Init _libmd_SHA384_Init +#endif +#ifndef SHA384_Update +#define SHA384_Update _libmd_SHA384_Update +#endif +#ifndef SHA384_Final +#define SHA384_Final _libmd_SHA384_Final +#endif +#ifndef SHA384_End +#define SHA384_End _libmd_SHA384_End +#endif +#ifndef SHA384_Fd +#define SHA384_Fd _libmd_SHA384_Fd +#endif +#ifndef SHA384_FdChunk +#define SHA384_FdChunk _libmd_SHA384_FdChunk +#endif +#ifndef SHA384_File +#define SHA384_File _libmd_SHA384_File +#endif +#ifndef SHA384_FileChunk +#define SHA384_FileChunk _libmd_SHA384_FileChunk +#endif +#ifndef SHA384_Data +#define SHA384_Data _libmd_SHA384_Data +#endif + +#ifndef SHA384_version +#define SHA384_version _libmd_SHA384_version +#endif + +void SHA384_Init(SHA384_CTX *); +void SHA384_Update(SHA384_CTX *, const void *, size_t); +void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)], + SHA384_CTX *); +#ifndef _KERNEL +char *SHA384_End(SHA384_CTX *, char *); +char *SHA384_Data(const void *, unsigned int, char *); +char *SHA384_Fd(int, char *); +char *SHA384_FdChunk(int, char *, off_t, off_t); +char *SHA384_File(const char *, char *); +char *SHA384_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA384_H_ */ diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h new file mode 100644 index 000000000..b6fb733ca --- /dev/null +++ b/module/os/freebsd/spl/sha512.h @@ -0,0 +1,101 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512_H_ +#define _SHA512_H_ + +#ifndef _KERNEL +#include <sys/types.h> +#endif + +#define SHA512_BLOCK_LENGTH 128 +#define SHA512_DIGEST_LENGTH 64 +#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA512Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA512_BLOCK_LENGTH]; +} SHA512_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#if 0 +#ifndef SHA512_Init +#define SHA512_Init _libmd_SHA512_Init +#endif +#ifndef SHA512_Update +#define SHA512_Update _libmd_SHA512_Update +#endif +#ifndef SHA512_Final +#define SHA512_Final _libmd_SHA512_Final +#endif +#endif +#ifndef SHA512_End +#define SHA512_End _libmd_SHA512_End +#endif +#ifndef SHA512_Fd +#define SHA512_Fd _libmd_SHA512_Fd +#endif +#ifndef SHA512_FdChunk +#define SHA512_FdChunk _libmd_SHA512_FdChunk +#endif +#ifndef SHA512_File +#define SHA512_File _libmd_SHA512_File +#endif +#ifndef SHA512_FileChunk +#define SHA512_FileChunk _libmd_SHA512_FileChunk +#endif +#ifndef SHA512_Data +#define SHA512_Data _libmd_SHA512_Data +#endif + +#ifndef SHA512_Transform +#define SHA512_Transform _libmd_SHA512_Transform +#endif +#ifndef SHA512_version +#define SHA512_version _libmd_SHA512_version +#endif + +void SHA512_Init(SHA512_CTX *); +void SHA512_Update(SHA512_CTX *, const void *, size_t); +void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_End(SHA512_CTX *, char *); +char *SHA512_Data(const void *, unsigned int, char *); +char *SHA512_Fd(int, char *); +char *SHA512_FdChunk(int, char *, off_t, off_t); +char *SHA512_File(const char *, char *); +char *SHA512_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512_H_ */ diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c new file mode 100644 index 000000000..146f338f0 --- /dev/null +++ b/module/os/freebsd/spl/sha512c.c @@ -0,0 +1,508 @@ +/* + * Copyright 2005 Colin Percival + * Copyright (c) 2015 Allan Jude <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/endian.h> +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include "sha512.h" +#include "sha512t.h" +#include "sha384.h" + +#if BYTE_ORDER == BIG_ENDIAN + +/* Copy a vector of big-endian uint64_t into a vector of bytes */ +#define be64enc_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +/* Copy a vector of bytes into a vector of big-endian uint64_t */ +#define be64dec_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +#else /* BYTE_ORDER != BIG_ENDIAN */ + +/* + * Encode a length len/4 vector of (uint64_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 8. + */ +static void +be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 8; i++) + be64enc(dst + i * 8, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint64_t). Assumes len is a multiple of 8. + */ +static void +be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 8; i++) + dst[i] = be64dec(src + i * 8); +} + +#endif /* BYTE_ORDER != BIG_ENDIAN */ + +/* SHA512 round constants. */ +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +/* Elementary functions used by SHA512 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (64 - n))) +#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39)) +#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41)) +#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7)) +#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6)) + +/* SHA512 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(80 - i) % 8], S[(81 - i) % 8], \ + S[(82 - i) % 8], S[(83 - i) % 8], \ + S[(84 - i) % 8], S[(85 - i) % 8], \ + S[(86 - i) % 8], S[(87 - i) % 8], \ + W[i + ii] + K[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA512 block compression function. The 512-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA512_Transform(uint64_t *state, + const unsigned char block[SHA512_BLOCK_LENGTH]) +{ + uint64_t W[80]; + uint64_t S[8]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be64dec_vect(W, block, SHA512_BLOCK_LENGTH); + + /* 2. Initialize working variables. */ + memcpy(S, state, SHA512_DIGEST_LENGTH); + + /* 3. Mix. */ + for (i = 0; i < 80; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 64) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static unsigned char PAD[SHA512_BLOCK_LENGTH] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA512_Pad(SHA512_CTX * ctx) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Pad to 112 mod 128, transforming if we finish a block en route. */ + if (r < 112) { + /* Pad to 112 mod 128. */ + memcpy(&ctx->buf[r], PAD, 112 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 128 - r); + SHA512_Transform(ctx->state, ctx->buf); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 112); + } + + /* Add the terminating bit-count. */ + be64enc_vect(&ctx->buf[112], ctx->count, 16); + + /* Mix in the final block. */ + SHA512_Transform(ctx->state, ctx->buf); +} + +/* SHA-512 initialization. Begins a SHA-512 operation. */ +void +SHA512_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6a09e667f3bcc908ULL; + ctx->state[1] = 0xbb67ae8584caa73bULL; + ctx->state[2] = 0x3c6ef372fe94f82bULL; + ctx->state[3] = 0xa54ff53a5f1d36f1ULL; + ctx->state[4] = 0x510e527fade682d1ULL; + ctx->state[5] = 0x9b05688c2b3e6c1fULL; + ctx->state[6] = 0x1f83d9abfb41bd6bULL; + ctx->state[7] = 0x5be0cd19137e2179ULL; +} + +/* Add bytes into the hash */ +void +SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + uint64_t bitlen[2]; + uint64_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint64_t)len) << 3; + bitlen[0] = ((uint64_t)len) >> 61; + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < SHA512_BLOCK_LENGTH - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r); + SHA512_Transform(ctx->state, ctx->buf); + src += SHA512_BLOCK_LENGTH - r; + len -= SHA512_BLOCK_LENGTH - r; + + /* Perform complete blocks */ + while (len >= SHA512_BLOCK_LENGTH) { + SHA512_Transform(ctx->state, src); + src += SHA512_BLOCK_LENGTH; + len -= SHA512_BLOCK_LENGTH; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-512 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* SHA-512t: ******************************************************** */ +/* + * the SHA512t transforms are identical to SHA512 so reuse the existing function + */ +void +SHA512_224_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x8c3d37c819544da2ULL; + ctx->state[1] = 0x73e1996689dcd4d6ULL; + ctx->state[2] = 0x1dfab7ae32ff9c82ULL; + ctx->state[3] = 0x679dd514582f9fcfULL; + ctx->state[4] = 0x0f6d2b697bd44da8ULL; + ctx->state[5] = 0x77e36f7304c48942ULL; + ctx->state[6] = 0x3f9d85a86a1d36c8ULL; + ctx->state[7] = 0x1112e6ad91d692a1ULL; +} + +void +SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void +SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH], + SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +void +SHA512_256_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x22312194fc2bf72cULL; + ctx->state[1] = 0x9f555fa3c84c64c2ULL; + ctx->state[2] = 0x2393b86b6f53b151ULL; + ctx->state[3] = 0x963877195940eabdULL; + ctx->state[4] = 0x96283ee2a88effe3ULL; + ctx->state[5] = 0xbe5e1e2553863992ULL; + ctx->state[6] = 0x2b0199fc2c85b8aaULL; + ctx->state[7] = 0x0eb72ddc81c52ca2ULL; +} + +void +SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void +SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH], + SHA512_CTX * ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* ** SHA-384: ******************************************************** */ +/* + * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped + */ + +/* SHA-384 initialization. Begins a SHA-384 operation. */ +void +SHA384_Init(SHA384_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; + ctx->state[1] = 0x629a292a367cd507ULL; + ctx->state[2] = 0x9159015a3070dd17ULL; + ctx->state[3] = 0x152fecd8f70e5939ULL; + ctx->state[4] = 0x67332667ffc00b31ULL; + ctx->state[5] = 0x8eb44a8768581511ULL; + ctx->state[6] = 0xdb0c2e0d64f98fa7ULL; + ctx->state[7] = 0x47b5481dbefa4fa4ULL; +} + +/* Add bytes into the SHA-384 hash */ +void +SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update((SHA512_CTX *)ctx, in, len); +} + +/* + * SHA-384 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad((SHA512_CTX *)ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +#if 0 +/* + * When building libmd, provide weak references. Note: this is not + * activated in the context of compiling these sources for internal + * use in libcrypt. + */ +#undef SHA512_Init +__weak_reference(_libmd_SHA512_Init, SHA512_Init); +#undef SHA512_Update +__weak_reference(_libmd_SHA512_Update, SHA512_Update); +#undef SHA512_Final +__weak_reference(_libmd_SHA512_Final, SHA512_Final); +#undef SHA512_Transform +__weak_reference(_libmd_SHA512_Transform, SHA512_Transform); + +#undef SHA512_224_Init +__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init); +#undef SHA512_224_Update +__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update); +#undef SHA512_224_Final +__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final); + +#undef SHA512_256_Init +__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init); +#undef SHA512_256_Update +__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update); +#undef SHA512_256_Final +__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final); + +#undef SHA384_Init +__weak_reference(_libmd_SHA384_Init, SHA384_Init); +#undef SHA384_Update +__weak_reference(_libmd_SHA384_Update, SHA384_Update); +#undef SHA384_Final +__weak_reference(_libmd_SHA384_Final, SHA384_Final); +#endif diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h new file mode 100644 index 000000000..703867fc0 --- /dev/null +++ b/module/os/freebsd/spl/sha512t.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 Allan Jude <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512T_H_ +#define _SHA512T_H_ + +#include "sha512.h" + +#ifndef _KERNEL +#include <sys/types.h> +#endif + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1) +#define SHA512_256_DIGEST_LENGTH 32 +#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1) + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA512_224_Init +#define SHA512_224_Init _libmd_SHA512_224_Init +#endif +#ifndef SHA512_224_Update +#define SHA512_224_Update _libmd_SHA512_224_Update +#endif +#ifndef SHA512_224_Final +#define SHA512_224_Final _libmd_SHA512_224_Final +#endif +#ifndef SHA512_224_End +#define SHA512_224_End _libmd_SHA512_224_End +#endif +#ifndef SHA512_224_Fd +#define SHA512_224_Fd _libmd_SHA512_224_Fd +#endif +#ifndef SHA512_224_FdChunk +#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk +#endif +#ifndef SHA512_224_File +#define SHA512_224_File _libmd_SHA512_224_File +#endif +#ifndef SHA512_224_FileChunk +#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk +#endif +#ifndef SHA512_224_Data +#define SHA512_224_Data _libmd_SHA512_224_Data +#endif + +#ifndef SHA512_224_Transform +#define SHA512_224_Transform _libmd_SHA512_224_Transform +#endif +#ifndef SHA512_224_version +#define SHA512_224_version _libmd_SHA512_224_version +#endif + +#ifndef SHA512_256_Init +#define SHA512_256_Init _libmd_SHA512_256_Init +#endif +#ifndef SHA512_256_Update +#define SHA512_256_Update _libmd_SHA512_256_Update +#endif +#ifndef SHA512_256_Final +#define SHA512_256_Final _libmd_SHA512_256_Final +#endif +#ifndef SHA512_256_End +#define SHA512_256_End _libmd_SHA512_256_End +#endif +#ifndef SHA512_256_Fd +#define SHA512_256_Fd _libmd_SHA512_256_Fd +#endif +#ifndef SHA512_256_FdChunk +#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk +#endif +#ifndef SHA512_256_File +#define SHA512_256_File _libmd_SHA512_256_File +#endif +#ifndef SHA512_256_FileChunk +#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk +#endif +#ifndef SHA512_256_Data +#define SHA512_256_Data _libmd_SHA512_256_Data +#endif + +#ifndef SHA512_256_Transform +#define SHA512_256_Transform _libmd_SHA512_256_Transform +#endif +#ifndef SHA512_256_version +#define SHA512_256_version _libmd_SHA512_256_version +#endif + +void SHA512_224_Init(SHA512_CTX *); +void SHA512_224_Update(SHA512_CTX *, const void *, size_t); +void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_224_End(SHA512_CTX *, char *); +char *SHA512_224_Data(const void *, unsigned int, char *); +char *SHA512_224_Fd(int, char *); +char *SHA512_224_FdChunk(int, char *, off_t, off_t); +char *SHA512_224_File(const char *, char *); +char *SHA512_224_FileChunk(const char *, char *, off_t, off_t); +#endif +void SHA512_256_Init(SHA512_CTX *); +void SHA512_256_Update(SHA512_CTX *, const void *, size_t); +void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_256_End(SHA512_CTX *, char *); +char *SHA512_256_Data(const void *, unsigned int, char *); +char *SHA512_256_Fd(int, char *); +char *SHA512_256_FdChunk(int, char *, off_t, off_t); +char *SHA512_256_File(const char *, char *); +char *SHA512_256_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512T_H_ */ diff --git a/module/os/freebsd/spl/spl_acl.c b/module/os/freebsd/spl/spl_acl.c new file mode 100644 index 000000000..bb4c30728 --- /dev/null +++ b/module/os/freebsd/spl/spl_acl.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2008, 2009 Edward Tomasz NapieraÅ‚a <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/errno.h> +#include <sys/zfs_acl.h> +#include <sys/acl.h> + +struct zfs2bsd { + uint32_t zb_zfs; + int zb_bsd; +}; + +struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA}, + {ACE_WRITE_DATA, ACL_WRITE_DATA}, + {ACE_EXECUTE, ACL_EXECUTE}, + {ACE_APPEND_DATA, ACL_APPEND_DATA}, + {ACE_DELETE_CHILD, ACL_DELETE_CHILD}, + {ACE_DELETE, ACL_DELETE}, + {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES}, + {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, + {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, + {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, + {ACE_READ_ACL, ACL_READ_ACL}, + {ACE_WRITE_ACL, ACL_WRITE_ACL}, + {ACE_WRITE_OWNER, ACL_WRITE_OWNER}, + {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE}, + {0, 0}}; + +struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE, + ACL_ENTRY_FILE_INHERIT}, + {ACE_DIRECTORY_INHERIT_ACE, + ACL_ENTRY_DIRECTORY_INHERIT}, + {ACE_NO_PROPAGATE_INHERIT_ACE, + ACL_ENTRY_NO_PROPAGATE_INHERIT}, + {ACE_INHERIT_ONLY_ACE, + ACL_ENTRY_INHERIT_ONLY}, + {ACE_INHERITED_ACE, + ACL_ENTRY_INHERITED}, + {ACE_SUCCESSFUL_ACCESS_ACE_FLAG, + ACL_ENTRY_SUCCESSFUL_ACCESS}, + {ACE_FAILED_ACCESS_ACE_FLAG, + ACL_ENTRY_FAILED_ACCESS}, + {0, 0}}; + +static int +_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + int bsd = 0; + + for (tmp = table; tmp->zb_zfs != 0; tmp++) { + if (zfs & tmp->zb_zfs) + bsd |= tmp->zb_bsd; + } + + return (bsd); +} + +static uint32_t +_zfs_from_bsd(int bsd, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + uint32_t zfs = 0; + + for (tmp = table; tmp->zb_bsd != 0; tmp++) { + if (bsd & tmp->zb_bsd) + zfs |= tmp->zb_zfs; + } + + return (zfs); +} + +int +acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries) +{ + int i; + struct acl_entry *entry; + const ace_t *ace; + + if (nentries < 1) { + printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n"); + return (EINVAL); + } + + if (nentries > ACL_MAX_ENTRIES) { + /* + * I believe it may happen only when moving a pool + * from SunOS to FreeBSD. + */ + printf("acl_from_aces: ZFS ACL too big to fit " + "into 'struct acl'; returning EINVAL.\n"); + return (EINVAL); + } + + bzero(aclp, sizeof (*aclp)); + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + aclp->acl_cnt = nentries; + + for (i = 0; i < nentries; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + if (ace->a_flags & ACE_OWNER) + entry->ae_tag = ACL_USER_OBJ; + else if (ace->a_flags & ACE_GROUP) + entry->ae_tag = ACL_GROUP_OBJ; + else if (ace->a_flags & ACE_EVERYONE) + entry->ae_tag = ACL_EVERYONE; + else if (ace->a_flags & ACE_IDENTIFIER_GROUP) + entry->ae_tag = ACL_GROUP; + else + entry->ae_tag = ACL_USER; + + if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP) + entry->ae_id = ace->a_who; + else + entry->ae_id = ACL_UNDEFINED_ID; + + entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms); + entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags); + + switch (ace->a_type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM; + break; + default: + panic("acl_from_aces: a_type is 0x%x", ace->a_type); + } + } + + return (0); +} + +void +aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp) +{ + int i; + const struct acl_entry *entry; + ace_t *ace; + + bzero(aces, sizeof (*aces) * aclp->acl_cnt); + + *nentries = aclp->acl_cnt; + + for (i = 0; i < aclp->acl_cnt; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + ace->a_who = entry->ae_id; + + if (entry->ae_tag == ACL_USER_OBJ) + ace->a_flags = ACE_OWNER; + else if (entry->ae_tag == ACL_GROUP_OBJ) + ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP); + else if (entry->ae_tag == ACL_GROUP) + ace->a_flags = ACE_IDENTIFIER_GROUP; + else if (entry->ae_tag == ACL_EVERYONE) + ace->a_flags = ACE_EVERYONE; + else /* ACL_USER */ + ace->a_flags = 0; + + ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms); + ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags); + + switch (entry->ae_entry_type) { + case ACL_ENTRY_TYPE_ALLOW: + ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_DENY: + ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_ALARM: + ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_AUDIT: + ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE; + break; + default: + panic("aces_from_acl: ae_entry_type is 0x%x", + entry->ae_entry_type); + } + } +} diff --git a/module/os/freebsd/spl/spl_atomic.c b/module/os/freebsd/spl/spl_atomic.c new file mode 100644 index 000000000..e82fed847 --- /dev/null +++ b/module/os/freebsd/spl/spl_atomic.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/atomic.h> + +#ifdef _KERNEL +#include <sys/kernel.h> + +struct mtx atomic_mtx; +MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF); +#else +#include <pthread.h> + +#define mtx_lock(lock) pthread_mutex_lock(lock) +#define mtx_unlock(lock) pthread_mutex_unlock(lock) + +static pthread_mutex_t atomic_mtx; + +static __attribute__((constructor)) void +atomic_init(void) +{ + pthread_mutex_init(&atomic_mtx, NULL); +} +#endif + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) +void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + + mtx_lock(&atomic_mtx); + *target += delta; + mtx_unlock(&atomic_mtx); +} + +void +atomic_dec_64(volatile uint64_t *target) +{ + + mtx_lock(&atomic_mtx); + *target -= 1; + mtx_unlock(&atomic_mtx); +} +#endif + +uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + uint64_t newval; + + mtx_lock(&atomic_mtx); + newval = (*target += delta); + mtx_unlock(&atomic_mtx); + return (newval); +} + +#if defined(__powerpc__) || defined(__arm__) || defined(__mips__) +void +atomic_or_8(volatile uint8_t *target, uint8_t value) +{ + mtx_lock(&atomic_mtx); + *target |= value; + mtx_unlock(&atomic_mtx); +} +#endif + +uint8_t +atomic_or_8_nv(volatile uint8_t *target, uint8_t value) +{ + uint8_t newval; + + mtx_lock(&atomic_mtx); + newval = (*target |= value); + mtx_unlock(&atomic_mtx); + return (newval); +} + +uint64_t +atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ + uint64_t oldval; + + mtx_lock(&atomic_mtx); + oldval = *target; + if (oldval == cmp) + *target = newval; + mtx_unlock(&atomic_mtx); + return (oldval); +} + +uint32_t +atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval) +{ + uint32_t oldval; + + mtx_lock(&atomic_mtx); + oldval = *target; + if (oldval == cmp) + *target = newval; + mtx_unlock(&atomic_mtx); + return (oldval); +} + +void +membar_producer(void) +{ + /* nothing */ +} diff --git a/module/os/freebsd/spl/spl_cmn_err.c b/module/os/freebsd/spl/spl_cmn_err.c new file mode 100644 index 000000000..23566603f --- /dev/null +++ b/module/os/freebsd/spl/spl_cmn_err.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 John Birrell <[email protected]>. All rights reserved. + * Copyright 2012 Martin Matuska <[email protected]>. All rights reserved. + */ + +#include <sys/cmn_err.h> + +void +vcmn_err(int ce, const char *fmt, va_list adx) +{ + char buf[256]; + const char *prefix; + + prefix = NULL; /* silence unwitty compilers */ + switch (ce) { + case CE_CONT: + prefix = "Solaris(cont): "; + break; + case CE_NOTE: + prefix = "Solaris: NOTICE: "; + break; + case CE_WARN: + prefix = "Solaris: WARNING: "; + break; + case CE_PANIC: + prefix = "Solaris(panic): "; + break; + case CE_IGNORE: + break; + default: + panic("Solaris: unknown severity level"); + } + if (ce == CE_PANIC) { + vsnprintf(buf, sizeof (buf), fmt, adx); + panic("%s%s", prefix, buf); + } + if (ce != CE_IGNORE) { + printf("%s", prefix); + vprintf(fmt, adx); + printf("\n"); + } +} + +void +cmn_err(int type, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(type, fmt, ap); + va_end(ap); +} diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c new file mode 100644 index 000000000..e7b2ff823 --- /dev/null +++ b/module/os/freebsd/spl/spl_dtrace.c @@ -0,0 +1,37 @@ +/* + * Copyright 2014 The FreeBSD Project. + * All rights reserved. + * + * This software was developed by Steven Hartland. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/sdt.h> + +/* CSTYLED */ +SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/spl/spl_kmem.c b/module/os/freebsd/spl/spl_kmem.c new file mode 100644 index 000000000..af3747c27 --- /dev/null +++ b/module/os/freebsd/spl/spl_kmem.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/byteorder.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/debug.h> +#include <sys/mutex.h> +#include <sys/vmmeter.h> + + +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> + +#ifdef KMEM_DEBUG +#include <sys/queue.h> +#include <sys/stack.h> +#endif + +#ifdef _KERNEL +MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); +#else +#define malloc(size, type, flags) malloc(size) +#define free(addr, type) free(addr) +#endif + +#ifdef KMEM_DEBUG +struct kmem_item { + struct stack stack; + LIST_ENTRY(kmem_item) next; +}; +static LIST_HEAD(, kmem_item) kmem_items; +static struct mtx kmem_items_mtx; +MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); +#endif /* KMEM_DEBUG */ + +#include <sys/vmem.h> + +void * +zfs_kmem_alloc(size_t size, int kmflags) +{ + void *p; +#ifdef KMEM_DEBUG + struct kmem_item *i; + + size += sizeof (struct kmem_item); +#endif + p = malloc(MAX(size, 16), M_SOLARIS, kmflags); +#ifndef _KERNEL + if (kmflags & KM_SLEEP) + assert(p != NULL); +#endif +#ifdef KMEM_DEBUG + if (p != NULL) { + i = p; + p = (uint8_t *)p + sizeof (struct kmem_item); + stack_save(&i->stack); + mtx_lock(&kmem_items_mtx); + LIST_INSERT_HEAD(&kmem_items, i, next); + mtx_unlock(&kmem_items_mtx); + } +#endif + return (p); +} + +void +zfs_kmem_free(void *buf, size_t size __unused) +{ +#ifdef KMEM_DEBUG + if (buf == NULL) { + printf("%s: attempt to free NULL\n", __func__); + return; + } + struct kmem_item *i; + + buf = (uint8_t *)buf - sizeof (struct kmem_item); + mtx_lock(&kmem_items_mtx); + LIST_FOREACH(i, &kmem_items, next) { + if (i == buf) + break; + } + ASSERT(i != NULL); + LIST_REMOVE(i, next); + mtx_unlock(&kmem_items_mtx); + memset(buf, 0xDC, MAX(size, 16)); +#endif + free(buf, M_SOLARIS); +} + +static uint64_t kmem_size_val; + +static void +kmem_size_init(void *unused __unused) +{ + + kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; + if (kmem_size_val > vm_kmem_size) + kmem_size_val = vm_kmem_size; +} +SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); + +uint64_t +kmem_size(void) +{ + + return (kmem_size_val); +} + +static int +kmem_std_constructor(void *mem, int size __unused, void *private, int flags) +{ + struct kmem_cache *cache = private; + + return (cache->kc_constructor(mem, cache->kc_private, flags)); +} + +static void +kmem_std_destructor(void *mem, int size __unused, void *private) +{ + struct kmem_cache *cache = private; + + cache->kc_destructor(mem, cache->kc_private); +} + +kmem_cache_t * +kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) +{ + kmem_cache_t *cache; + + ASSERT(vmp == NULL); + + cache = kmem_alloc(sizeof (*cache), KM_SLEEP); + strlcpy(cache->kc_name, name, sizeof (cache->kc_name)); + cache->kc_constructor = constructor; + cache->kc_destructor = destructor; + cache->kc_private = private; +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, + constructor != NULL ? kmem_std_constructor : NULL, + destructor != NULL ? kmem_std_destructor : NULL, + NULL, NULL, align > 0 ? align - 1 : 0, cflags); +#else + cache->kc_size = bufsize; +#endif + + return (cache); +} + +void +kmem_cache_destroy(kmem_cache_t *cache) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zdestroy(cache->kc_zone); +#endif + kmem_free(cache, sizeof (*cache)); +} + +void * +kmem_cache_alloc(kmem_cache_t *cache, int flags) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + return (uma_zalloc_arg(cache->kc_zone, cache, flags)); +#else + void *p; + + p = kmem_alloc(cache->kc_size, flags); + if (p != NULL && cache->kc_constructor != NULL) + kmem_std_constructor(p, cache->kc_size, cache, flags); + return (p); +#endif +} + +void +kmem_cache_free(kmem_cache_t *cache, void *buf) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zfree_arg(cache->kc_zone, buf, cache); +#else + if (cache->kc_destructor != NULL) + kmem_std_destructor(buf, cache->kc_size, cache); + kmem_free(buf, cache->kc_size); +#endif +} + +/* + * Allow our caller to determine if there are running reaps. + * + * This call is very conservative and may return B_TRUE even when + * reaping activity isn't active. If it returns B_FALSE, then reaping + * activity is definitely inactive. + */ +boolean_t +kmem_cache_reap_active(void) +{ + + return (B_FALSE); +} + +/* + * Reap (almost) everything soon. + * + * Note: this does not wait for the reap-tasks to complete. Caller + * should use kmem_cache_reap_active() (above) and/or moderation to + * avoid scheduling too many reap-tasks. + */ +#ifdef _KERNEL +void +kmem_cache_reap_soon(kmem_cache_t *cache) +{ +#ifndef KMEM_DEBUG +#if __FreeBSD_version >= 1300043 + uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN); +#else + zone_drain(cache->kc_zone); +#endif +#endif +} + +void +kmem_reap(void) +{ +#if __FreeBSD_version >= 1300043 + uma_reclaim(UMA_RECLAIM_TRIM); +#else + uma_reclaim(); +#endif +} +#else +void +kmem_cache_reap_soon(kmem_cache_t *cache __unused) +{ +} + +void +kmem_reap(void) +{ +} +#endif + +int +kmem_debugging(void) +{ + return (0); +} + +void * +calloc(size_t n, size_t s) +{ + return (kmem_zalloc(n * s, KM_NOSLEEP)); +} + +char * +kmem_vasprintf(const char *fmt, va_list adx) +{ + char *msg; + va_list adx2; + + va_copy(adx2, adx); + msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); + (void) vsprintf(msg, fmt, adx2); + va_end(adx2); + + return (msg); +} + +#include <vm/uma.h> +#include <vm/uma_int.h> +#ifdef KMEM_DEBUG +#error "KMEM_DEBUG not currently supported" +#endif + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (uma_zone_get_cur(cache->kc_zone)); +} + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->kc_zone->uz_size); +} + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} + +#ifdef KMEM_DEBUG +void kmem_show(void *); +void +kmem_show(void *dummy __unused) +{ + struct kmem_item *i; + + mtx_lock(&kmem_items_mtx); + if (LIST_EMPTY(&kmem_items)) + printf("KMEM_DEBUG: No leaked elements.\n"); + else { + printf("KMEM_DEBUG: Leaked elements:\n\n"); + LIST_FOREACH(i, &kmem_items, next) { + printf("address=%p\n", i); + stack_print_ddb(&i->stack); + printf("\n"); + } + } + mtx_unlock(&kmem_items_mtx); +} + +SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); +#endif /* KMEM_DEBUG */ diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c new file mode 100644 index 000000000..fda03a3d7 --- /dev/null +++ b/module/os/freebsd/spl/spl_kstat.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <sys/kstat.h> + +static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); + +SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +kstat_t * +__kstat_create(const char *module, int instance, const char *name, + const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) +{ + struct sysctl_oid *root; + kstat_t *ksp; + + KASSERT(instance == 0, ("instance=%d", instance)); + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + /* + * Allocate the main structure. We don't need to copy module/class/name + * stuff in here, because it is only used for sysctl node creation + * done in this function. + */ + ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_instance = instance; + strncpy(ksp->ks_name, name, KSTAT_STRLEN); + strncpy(ksp->ks_class, class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = flags; + ksp->ks_update = kstat_default_update; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + panic("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + /* + * Create sysctl tree for those statistics: + * + * kstat.<module>.<class>.<name>. + */ + sysctl_ctx_init(&ksp->ks_sysctl_ctx); + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, + ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s tree!\n", __func__, module); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), + OID_AUTO, class, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, + module, class); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), + OID_AUTO, name, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, + module, class, name); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + ksp->ks_sysctl_root = root; + + return (ksp); +} + +static int +kstat_sysctl(SYSCTL_HANDLER_ARGS) +{ + kstat_named_t *ksent = arg1; + uint64_t val; + + val = ksent->value.ui64; + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +void +kstat_install(kstat_t *ksp) +{ + kstat_named_t *ksent; + char *namelast; + int typelast; + + ksent = ksp->ks_data; + if (ksp->ks_ndata == UINT32_MAX) { +#ifdef INVARIANTS + printf("can't handle raw ops yet!!!\n"); +#endif + return; + } + if (ksent == NULL) { + printf("%s ksp->ks_data == NULL!!!!\n", __func__); + return; + } + typelast = 0; + namelast = NULL; + for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { + if (ksent->data_type != 0) { + typelast = ksent->data_type; + namelast = ksent->name; + } + switch (typelast) { + case KSTAT_DATA_INT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S32 | CTLFLAG_RD, ksent, + sizeof (*ksent), kstat_sysctl, "I", + namelast); + break; + case KSTAT_DATA_UINT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U32 | CTLFLAG_RD, ksent, + sizeof (*ksent), kstat_sysctl, "IU", + namelast); + break; + case KSTAT_DATA_INT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S64 | CTLFLAG_RD, ksent, + sizeof (*ksent), kstat_sysctl, "Q", + namelast); + break; + case KSTAT_DATA_UINT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U64 | CTLFLAG_RD, ksent, + sizeof (*ksent), kstat_sysctl, "QU", + namelast); + break; + default: + panic("unsupported type: %d", typelast); + } + + } +} + +void +kstat_delete(kstat_t *ksp) +{ + + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); +} + +void +kstat_set_string(char *dst, const char *src) +{ + + bzero(dst, KSTAT_STRLEN); + (void) strncpy(dst, src, KSTAT_STRLEN - 1); +} + +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) +{ + + kstat_set_string(knp->name, name); + knp->data_type = data_type; +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c new file mode 100644 index 000000000..ab4702574 --- /dev/null +++ b/module/os/freebsd/spl/spl_misc.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/limits.h> +#include <sys/misc.h> +#include <sys/sysctl.h> + +#include <sys/zfs_context.h> + +char hw_serial[11] = "0"; + +static struct opensolaris_utsname hw_utsname = { + .machine = MACHINE +}; + +static void +opensolaris_utsname_init(void *arg) +{ + + hw_utsname.sysname = ostype; + hw_utsname.nodename = prison0.pr_hostname; + hw_utsname.release = osrelease; + snprintf(hw_utsname.version, sizeof (hw_utsname.version), + "%d", osreldate); +} + +char * +kmem_strdup(const char *s) +{ + char *buf; + + buf = kmem_alloc(strlen(s) + 1, KM_SLEEP); + strcpy(buf, s); + return (buf); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vpanic(fmt, ap); + va_end(ap); +} + +utsname_t * +utsname(void) +{ + return (&hw_utsname); +} +SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, + opensolaris_utsname_init, NULL); diff --git a/module/os/freebsd/spl/spl_policy.c b/module/os/freebsd/spl/spl_policy.c new file mode 100644 index 000000000..53732836f --- /dev/null +++ b/module/os/freebsd/spl/spl_policy.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/jail.h> +#include <sys/policy.h> +#include <sys/zfs_vfsops.h> + + +int +secpolicy_nfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON)); +} + +int +secpolicy_zfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_sys_config(cred_t *cr, int checkonly __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG)); +} + +int +secpolicy_zinject(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT)); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT)); +} + +int +secpolicy_fs_owner(struct mount *mp, cred_t *cr) +{ + + if (zfs_super_owner) { + if (cr->cr_uid == mp->mnt_cred->cr_uid && + cr->cr_prison == mp->mnt_cred->cr_prison) { + return (0); + } + } + return (EPERM); +} + +/* + * This check is done in kern_link(), so we could just return 0 here. + */ +extern int hardlink_check_uid; +int +secpolicy_basic_link(vnode_t *vp, cred_t *cr) +{ + + if (!hardlink_check_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_LINK)); +} + +int +secpolicy_vnode_stky_modify(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_remove(vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0) + return (EACCES); + if ((accmode & VWRITE) && + spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) { + return (EACCES); + } + if (accmode & VEXEC) { + if (vp->v_type == VDIR) { + if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0) + return (EACCES); + } else { + if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0) + return (EACCES); + } + } + return (0); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + */ +int +secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode) +{ + accmode_t mode; + + mode = ~curmode & wantmode; + + if (mode == 0) + return (0); + + return (secpolicy_vnode_access(cr, vp, owner, mode)); +} + +int +secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) +{ + static int privs[] = { + PRIV_VFS_ADMIN, + PRIV_VFS_READ, + PRIV_VFS_WRITE, + PRIV_VFS_EXEC, + PRIV_VFS_LOOKUP + }; + int i; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* Same as secpolicy_vnode_setdac */ + if (owner == cr->cr_uid) + return (0); + + for (i = 0; i < sizeof (privs)/sizeof (int); i++) { + int priv; + + switch (priv = privs[i]) { + case PRIV_VFS_EXEC: + if (vp->v_type == VDIR) + continue; + break; + case PRIV_VFS_LOOKUP: + if (vp->v_type != VDIR) + continue; + break; + } + if (spl_priv_check_cred(cr, priv) == 0) + return (0); + } + return (EPERM); +} + +int +secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + int mask = vap->va_mask; + int error; + + if (mask & AT_SIZE) { + if (vp->v_type == VDIR) + return (EISDIR); + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + if (mask & AT_MODE) { + /* + * If not the owner of the file then check privilege + * for two things: the privilege to set the mode at all + * and, if we're setting setuid, we also need permissions + * to add the set-uid bit, if we're not the owner. + * In the specific case of creating a set-uid root + * file, we need even more permissions. + */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); + if (error) + return (error); + } else { + vap->va_mode = ovap->va_mode; + } + if (mask & (AT_UID | AT_GID)) { + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + + /* + * To change the owner of a file, or change the group of + * a file to a group of which we are not a member, the + * caller must have privilege. + */ + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid && + !groupmember(vap->va_gid, cr))) { + if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { + error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN); + if (error) + return (error); + } + } + + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { + secpolicy_setid_clear(vap, vp, cr); + } + } + if (mask & (AT_ATIME | AT_MTIME)) { + /* + * From utimes(2): + * If times is NULL, ... The caller must be the owner of + * the file, have permission to write the file, or be the + * super-user. + * If times is non-NULL, ... The caller must be the owner of + * the file or be the super-user. + */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error && (vap->va_vaflags & VA_UTIMES_NULL)) + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_vnode_create_gid(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) +{ + + if (groupmember(gid, cr)) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SETGID)); +} + +int +secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, + boolean_t issuidroot __unused) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); +} + +void +secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return; + + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { + if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } + } +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr) +{ + int error; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the process + * is not a member of. Both of these are allowed in jail(8). + */ + if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { + if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE)) + return (EFTYPE); + } + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0) { + error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); + if (error) + return (error); + } + /* + * Deny setting setuid if we are not the file owner. + */ + if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { + error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* XXX: vfs_suser()? */ + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER)); +} + +int +secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN)); +} + +void +secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) +{ + + if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) { + MNT_ILOCK(vfsp); + vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; + vfs_clearmntopt(vfsp, MNTOPT_SETUID); + vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); + MNT_IUNLOCK(vfsp); + } +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, + vtype_t vtype) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS)); +} + +int +secpolicy_smb(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NETSMB)); +} diff --git a/module/os/freebsd/spl/spl_procfs_list.c b/module/os/freebsd/spl/spl_procfs_list.c new file mode 100644 index 000000000..7b4ae9d0e --- /dev/null +++ b/module/os/freebsd/spl/spl_procfs_list.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/list.h> +#include <sys/mutex.h> +#include <sys/procfs_list.h> + +void +seq_printf(struct seq_file *m, const char *fmt, ...) +{} + +void +procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} diff --git a/module/os/freebsd/spl/spl_string.c b/module/os/freebsd/spl/spl_string.c new file mode 100644 index 000000000..14d816b5c --- /dev/null +++ b/module/os/freebsd/spl/spl_string.c @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/param.h> +#include <sys/string.h> +#include <sys/kmem.h> +#include <machine/stdarg.h> + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +char * +strpbrk(const char *s, const char *b) +{ + const char *p; + + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + + return (NULL); +} + +/* + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. + */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +/* + * Do not change the length of the returned string; it must be freed + * with strfree(). + */ +char * +kmem_asprintf(const char *fmt, ...) +{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +void +kmem_strfree(char *str) +{ + ASSERT(str != NULL); + kmem_free(str, strlen(str) + 1); +} diff --git a/module/os/freebsd/spl/spl_sunddi.c b/module/os/freebsd/spl/spl_sunddi.c new file mode 100644 index 000000000..1fa4f56f1 --- /dev/null +++ b/module/os/freebsd/spl/spl_sunddi.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/limits.h> +#include <sys/misc.h> +#include <sys/sunddi.h> +#include <sys/sysctl.h> + +int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + + *result = strtol(str, nptr, base); + return (0); +} + +int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + + if (str == hw_serial) { + *result = prison0.pr_hostid; + return (0); + } + + *result = strtoul(str, nptr, base); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) +{ + + *result = (unsigned long long)strtouq(str, nptr, base); + return (0); +} + +int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + + *result = (long long)strtoq(str, nptr, base); + return (0); +} diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c new file mode 100644 index 000000000..d3748276a --- /dev/null +++ b/module/os/freebsd/spl/spl_sysevent.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek <[email protected]> + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kmem.h> +#include <sys/sbuf.h> +#include <sys/nvpair.h> +#include <sys/sunddi.h> +#include <sys/sysevent.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/bus.h> + +static int +log_sysevent(nvlist_t *event) +{ + struct sbuf *sb; + const char *type; + char typestr[128]; + nvpair_t *elem = NULL; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + type = NULL; + + while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + { + boolean_t value; + + (void) nvpair_value_boolean_value(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), + value ? "true" : "false"); + break; + } + case DATA_TYPE_UINT8: + { + uint8_t value; + + (void) nvpair_value_uint8(elem, &value); + sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); + break; + } + case DATA_TYPE_INT32: + { + int32_t value; + + (void) nvpair_value_int32(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT32: + { + uint32_t value; + + (void) nvpair_value_uint32(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_INT64: + { + int64_t value; + + (void) nvpair_value_int64(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + (void) nvpair_value_uint64(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_STRING: + { + char *value; + + (void) nvpair_value_string(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); + if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) + type = value; + break; + } + case DATA_TYPE_UINT8_ARRAY: + { + uint8_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint8_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%02hhx", value[ii]); + break; + } + case DATA_TYPE_UINT16_ARRAY: + { + uint16_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint16_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%04hx", value[ii]); + break; + } + case DATA_TYPE_UINT32_ARRAY: + { + uint32_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint32_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_INT64_ARRAY: + { + int64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_int64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016lld", + (long long)value[ii]); + break; + } + case DATA_TYPE_UINT64_ARRAY: + { + uint64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_STRING_ARRAY: + { + char **strarr; + uint_t ii, nelem; + + (void) nvpair_value_string_array(elem, &strarr, &nelem); + + for (ii = 0; ii < nelem; ii++) { + if (strarr[ii] == NULL) { + sbuf_printf(sb, " <NULL>"); + continue; + } + + sbuf_printf(sb, " %s", strarr[ii]); + if (strcmp(FM_CLASS, strarr[ii]) == 0) + type = strarr[ii]; + } + break; + } + case DATA_TYPE_NVLIST: + /* XXX - requires recursing in log_sysevent */ + break; + default: + printf("%s: type %d is not implemented\n", __func__, + nvpair_type(elem)); + break; + } + } + + if (sbuf_finish(sb) != 0) { + sbuf_delete(sb); + return (ENOMEM); + } + + if (type == NULL) + type = ""; + if (strncmp(type, "ESC_ZFS_", 8) == 0) { + snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); + type = typestr; + } + devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); + sbuf_delete(sb); + + return (0); +} + +static void +sysevent_worker(void *arg __unused) +{ + zfs_zevent_t *ze; + nvlist_t *event; + uint64_t dropped = 0; + uint64_t dst_size; + int error; + + zfs_zevent_init(&ze); + for (;;) { + dst_size = 131072; + dropped = 0; + event = NULL; + error = zfs_zevent_next(ze, &event, + &dst_size, &dropped); + if (error) { + error = zfs_zevent_wait(ze); + if (error == ESHUTDOWN) + break; + } else { + VERIFY(event != NULL); + log_sysevent(event); + nvlist_free(event); + } + } + zfs_zevent_destroy(ze); + kthread_exit(); +} + +void +ddi_sysevent_init(void) +{ + kproc_kthread_add(sysevent_worker, NULL, &zfsproc, NULL, 0, 0, + "zfskern", "sysevent"); +} diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c new file mode 100644 index 000000000..b6a501f67 --- /dev/null +++ b/module/os/freebsd/spl/spl_taskq.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2009 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/kmem.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/taskq.h> +#include <sys/zfs_context.h> + +#include <vm/uma.h> + +static uint_t taskq_tsd; +static uma_zone_t taskq_zone; + +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; +taskq_t *dynamic_taskq = NULL; + +extern int uma_align_cache; + +#define TQ_MASK uma_align_cache +#define TQ_PTR_MASK ~uma_align_cache + +#define TIMEOUT_TASK 1 +#define NORMAL_TASK 2 + +static int +taskqent_init(void *mem, int size, int flags) +{ + bzero(mem, sizeof (taskq_ent_t)); + return (0); +} + +static int +taskqent_ctor(void *mem, int size, void *arg, int flags) +{ + return (0); +} + +static void +taskqent_dtor(void *mem, int size, void *arg) +{ + taskq_ent_t *ent = mem; + + ent->tqent_gen = (ent->tqent_gen + 1) & TQ_MASK; +} + +static void +system_taskq_init(void *arg) +{ + + tsd_create(&taskq_tsd, NULL); + taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), + taskqent_ctor, taskqent_dtor, taskqent_init, NULL, + UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); + system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, + 0, 0, 0); + system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, + minclsyspri, 0, 0, 0); +} +SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, + NULL); + +static void +system_taskq_fini(void *arg) +{ + + taskq_destroy(system_taskq); + uma_zdestroy(taskq_zone); + tsd_destroy(&taskq_tsd); +} +SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, + NULL); + +static void +taskq_tsd_set(void *context) +{ + taskq_t *tq = context; + + tsd_set(taskq_tsd, tq); +} + +static taskq_t * +taskq_create_with_init(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, uint_t flags) +{ + taskq_t *tq; + + if ((flags & TASKQ_THREADS_CPU_PCT) != 0) + nthreads = MAX((mp_ncpus * nthreads) / 100, 1); + + tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_queue = taskqueue_create(name, M_WAITOK, + taskqueue_thread_enqueue, &tq->tq_queue); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, + taskq_tsd_set, tq); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + taskq_tsd_set, NULL); + (void) taskqueue_start_threads(&tq->tq_queue, nthreads, pri, + "%s", name); + + return ((taskq_t *)tq); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags)); +} + +void +taskq_destroy(taskq_t *tq) +{ + + taskqueue_free(tq->tq_queue); + kmem_free(tq, sizeof (*tq)); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + + return (taskqueue_member(tq->tq_queue, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +int +taskq_cancel_id(taskq_t *tq, taskqid_t tid) +{ + uint32_t pend; + int rc; + taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK); + + if (ent == NULL) + return (0); + if ((tid & TQ_MASK) != ent->tqent_gen) + return (0); + if (ent->tqent_type == TIMEOUT_TASK) { + rc = taskqueue_cancel_timeout(tq->tq_queue, + &ent->tqent_timeout_task, &pend); + } else + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskq_wait_id(tq, tid); + else + uma_zfree(taskq_zone, ent); + return (rc); +} + +static void +taskq_run(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); + uma_zfree(taskq_zone, task); +} + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskq_ent_t *task; + taskqid_t tid; + clock_t timo; + int mflag; + + timo = expire_time - ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + tid = (uintptr_t)task; + MPASS((tid & TQ_MASK) == 0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = TIMEOUT_TASK; + tid |= task->tqent_gen; + TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, + taskq_run, task); + + taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, + timo); + return (tid); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *task; + int mflag, prio; + taskqid_t tid; + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + + tid = (uintptr_t)task; + MPASS((tid & TQ_MASK) == 0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = NORMAL_TASK; + TASK_INIT(&task->tqent_task, prio, taskq_run, task); + tid |= task->tqent_gen; + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); + return (tid); +} + +static void +taskq_run_ent(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, + taskq_ent_t *task) +{ + int prio; + + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + + task->tqent_func = func; + task->tqent_arg = arg; + + TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); +} + +void +taskq_wait(taskq_t *tq) +{ + taskqueue_quiesce(tq->tq_queue); +} + +void +taskq_wait_id(taskq_t *tq, taskqid_t tid) +{ + taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK); + + if ((tid & TQ_MASK) != ent->tqent_gen) + return; + + taskqueue_drain(tq->tq_queue, &ent->tqent_task); +} + +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) +{ + taskqueue_drain_all(tq->tq_queue); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (t->tqent_task.ta_pending == 0); +} diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c new file mode 100644 index 000000000..05dbfd06d --- /dev/null +++ b/module/os/freebsd/spl/spl_uio.c @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +/* + * $FreeBSD$ + */ + +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/vnode.h> + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +int +uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +{ + struct iovec small_iovec[1]; + struct uio small_uio_clone; + struct uio *uio_clone; + int error; + + ASSERT3U(uio->uio_rw, ==, rw); + if (uio->uio_iovcnt == 1) { + small_uio_clone = *uio; + small_iovec[0] = *uio->uio_iov; + small_uio_clone.uio_iov = small_iovec; + uio_clone = &small_uio_clone; + } else { + uio_clone = cloneuio(uio); + } + + error = vn_io_fault_uiomove(p, n, uio_clone); + *cbytes = uio->uio_resid - uio_clone->uio_resid; + if (uio_clone != &small_uio_clone) + free(uio_clone, M_IOV); + return (error); +} + +/* + * Drop the next n chars out of *uiop. + */ +void +uioskip(uio_t *uio, size_t n) +{ + enum uio_seg segflg; + + /* For the full compatibility with illumos. */ + if (n > uio->uio_resid) + return; + + segflg = uio->uio_segflg; + uio->uio_segflg = UIO_NOCOPY; + uiomove(NULL, n, uio->uio_rw, uio); + uio->uio_segflg = segflg; +} diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c new file mode 100644 index 000000000..99da8c976 --- /dev/null +++ b/module/os/freebsd/spl/spl_vfs.c @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/cred.h> +#include <sys/vfs.h> +#include <sys/priv.h> +#include <sys/libkern.h> + +#include <sys/mutex.h> +#include <sys/vnode.h> + +MALLOC_DECLARE(M_MOUNT); + +void +vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused) +{ + struct vfsopt *opt; + size_t namesize; + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + + if (vfsp->mnt_opt == NULL) { + void *opts; + + MNT_IUNLOCK(vfsp); + opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK); + MNT_ILOCK(vfsp); + if (vfsp->mnt_opt == NULL) { + vfsp->mnt_opt = opts; + TAILQ_INIT(vfsp->mnt_opt); + } else { + free(opts, M_MOUNT); + } + } + + MNT_IUNLOCK(vfsp); + + opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK); + namesize = strlen(name) + 1; + opt->name = malloc(namesize, M_MOUNT, M_WAITOK); + strlcpy(opt->name, name, namesize); + opt->pos = -1; + opt->seen = 1; + if (arg == NULL) { + opt->value = NULL; + opt->len = 0; + } else { + opt->len = strlen(arg) + 1; + opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(arg, opt->value, opt->len); + } + + MNT_ILOCK(vfsp); + TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +void +vfs_clearmntopt(vfs_t *vfsp, const char *name) +{ + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + vfs_deleteopt(vfsp->mnt_opt, name); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +int +vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) +{ + struct vfsoptlist *opts = vfsp->mnt_optnew; + int error; + + if (opts == NULL) + return (0); + error = vfs_getopt(opts, opt, (void **)argp, NULL); + return (error != 0 ? 0 : 1); +} + +int +mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, + char *fspec, int fsflags) +{ + struct vfsconf *vfsp; + struct mount *mp; + vnode_t *vp, *mvp; + struct ucred *cr; + int error; + + ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); + + vp = *vpp; + *vpp = NULL; + error = 0; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + error = ENAMETOOLONG; + if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) + error = ENODEV; + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + /* + * We need vnode lock to protect v_mountedhere and vnode interlock + * to protect v_iflag. + */ + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK1(vp); + + /* + * Allocate and initialize the filesystem. + * We don't want regular user that triggered snapshot mount to be able + * to unmount it, so pass credentials of the parent mount. + */ + mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred); + + mp->mnt_optnew = NULL; + vfs_setmntopt(mp, "from", fspec, 0); + mp->mnt_optnew = mp->mnt_opt; + mp->mnt_opt = NULL; + + /* + * Set the mount level flags. + */ + mp->mnt_flag = fsflags & MNT_UPDATEMASK; + /* + * Snapshots are always read-only. + */ + mp->mnt_flag |= MNT_RDONLY; + /* + * We don't want snapshots to allow access to vulnerable setuid + * programs, so we turn off setuid when mounting snapshots. + */ + mp->mnt_flag |= MNT_NOSUID; + /* + * We don't want snapshots to be visible in regular + * mount(8) and df(1) output. + */ + mp->mnt_flag |= MNT_IGNORE; + /* + * XXX: This is evil, but we can't mount a snapshot as a regular user. + * XXX: Is is safe when snapshot is mounted from within a jail? + */ + cr = td->td_ucred; + td->td_ucred = kcred; + error = VFS_MOUNT(mp); + td->td_ucred = cr; + + if (error != 0) { + /* + * Clear VI_MOUNT and decrement the use count "atomically", + * under the vnode lock. This is not strictly required, + * but makes it easier to reason about the life-cycle and + * ownership of the covered vnode. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vput(vp); + vfs_unbusy(mp); + vfs_freeopts(mp->mnt_optnew); + mp->mnt_vnodecovered = NULL; + vfs_mount_destroy(mp); + return (error); + } + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + (void) VFS_STATFS(mp, &mp->mnt_stat); + + /* + * Prevent external consumers of mount options from reading + * mnt_optnew. + */ + mp->mnt_optnew = NULL; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#ifdef FREEBSD_NAMECACHE + cache_purge(vp); +#endif + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + + vp->v_mountedhere = mp; + /* Put the new filesystem on the mount list. */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + vfs_event_signal(NULL, VQ_MOUNT, 0); + if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) + panic("mount: lost mount"); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300048 + vfs_op_exit(mp); +#endif + vfs_unbusy(mp); + *vpp = mvp; + return (0); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. + */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY(vp->v_count > 0); + if (refcount_release_if_not_last(&vp->v_usecount)) { +#if __FreeBSD_version < 1300045 + vdrop(vp); +#endif + return; + } + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)vrele, vp, TQ_SLEEP) != 0); +} diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c new file mode 100644 index 000000000..cd18ebb7a --- /dev/null +++ b/module/os/freebsd/spl/spl_vm.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/byteorder.h> +#include <sys/lock.h> +#include <sys/freebsd_rwlock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +const int zfs_vm_pagerret_bad = VM_PAGER_BAD; +const int zfs_vm_pagerret_error = VM_PAGER_ERROR; +const int zfs_vm_pagerret_ok = VM_PAGER_OK; +const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; +const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; + +void +zfs_vmobject_assert_wlocked(vm_object_t object) +{ + + /* + * This is not ideal because FILE/LINE used by assertions will not + * be too helpful, but it must be an hard function for + * compatibility reasons. + */ + VM_OBJECT_ASSERT_WLOCKED(object); +} + +void +zfs_vmobject_wlock(vm_object_t object) +{ + + VM_OBJECT_WLOCK(object); +} + +void +zfs_vmobject_wunlock(vm_object_t object) +{ + + VM_OBJECT_WUNLOCK(object); +} diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c new file mode 100644 index 000000000..7549483d8 --- /dev/null +++ b/module/os/freebsd/spl/spl_zlib.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/zmod.h> +#if __FreeBSD_version >= 1300041 +#include <contrib/zlib/zlib.h> +#else +#include <sys/zlib.h> +#endif +#include <sys/kobj.h> + + +/*ARGSUSED*/ +static void * +zcalloc(void *opaque, uint_t items, uint_t size) +{ + + return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT)); +} + +/*ARGSUSED*/ +static void +zcfree(void *opaque, void *ptr) +{ + + free(ptr, M_SOLARIS); +} + +static int +zlib_deflateInit(z_stream *stream, int level) +{ + + stream->zalloc = zcalloc; + stream->opaque = NULL; + stream->zfree = zcfree; + + return (deflateInit(stream, level)); +} + +static int +zlib_deflate(z_stream *stream, int flush) +{ + return (deflate(stream, flush)); +} + +static int +zlib_deflateEnd(z_stream *stream) +{ + return (deflateEnd(stream)); +} + +static int +zlib_inflateInit(z_stream *stream) +{ + stream->zalloc = zcalloc; + stream->opaque = NULL; + stream->zfree = zcfree; + + return (inflateInit(stream)); +} + +static int +zlib_inflate(z_stream *stream, int finish) +{ +#if __FreeBSD_version >= 1300024 + return (inflate(stream, finish)); +#else + return (_zlib104_inflate(stream, finish)); +#endif +} + + +static int +zlib_inflateEnd(z_stream *stream) +{ + return (inflateInit(stream)); +} + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + // return (kmem_cache_alloc(zlib_workspace_cache, flags)); + return (NULL); +} + +static void +zlib_workspace_free(void *workspace) +{ + // kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + stream.opaque = NULL; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err); +} + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the compressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. + */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + return (err); +} + +#if 0 +int +spl_zlib_init(void) +{ + int size; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM | KMC_NOEMERGENCY); + if (!zlib_workspace_cache) + return (1); + + return (0); +} + +void +spl_zlib_fini(void) +{ + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; +} +#endif diff --git a/module/os/freebsd/spl/spl_zone.c b/module/os/freebsd/spl/spl_zone.c new file mode 100644 index 000000000..40f21934e --- /dev/null +++ b/module/os/freebsd/spl/spl_zone.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/jail.h> +#include <sys/osd.h> +#include <sys/priv.h> +#include <sys/zone.h> + +#include <sys/policy.h> + +static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); + +/* + * Structure to record list of ZFS datasets exported to a zone. + */ +typedef struct zone_dataset { + LIST_ENTRY(zone_dataset) zd_next; + char zd_dataset[0]; +} zone_dataset_t; + +LIST_HEAD(zone_dataset_head, zone_dataset); + +static int zone_slot; + +int +zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd, *zd2; + struct prison *pr; + int dofree, error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + /* Allocate memory before we grab prison's mutex. */ + zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); /* Locks &pr->pr_mtx. */ + sx_sunlock(&allprison_lock); + if (pr == NULL) { + free(zd, M_ZONES); + return (ENOENT); + } + + head = osd_jail_get(pr, zone_slot); + if (head != NULL) { + dofree = 0; + LIST_FOREACH(zd2, head, zd_next) { + if (strcmp(dataset, zd2->zd_dataset) == 0) { + free(zd, M_ZONES); + error = EEXIST; + goto end; + } + } + } else { + dofree = 1; + prison_hold_locked(pr); + mtx_unlock(&pr->pr_mtx); + head = malloc(sizeof (*head), M_ZONES, M_WAITOK); + LIST_INIT(head); + mtx_lock(&pr->pr_mtx); + error = osd_jail_set(pr, zone_slot, head); + KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", + error)); + } + strcpy(zd->zd_dataset, dataset); + LIST_INSERT_HEAD(head, zd, zd_next); +end: + if (dofree) + prison_free_locked(pr); + else + mtx_unlock(&pr->pr_mtx); + return (error); +} + +int +zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + int error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) { + error = ENOENT; + goto end; + } + LIST_FOREACH(zd, head, zd_next) { + if (strcmp(dataset, zd->zd_dataset) == 0) + break; + } + if (zd == NULL) + error = ENOENT; + else { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + if (LIST_EMPTY(head)) + osd_jail_del(pr, zone_slot); + error = 0; + } +end: + mtx_unlock(&pr->pr_mtx); + return (error); +} + +/* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + size_t len; + int ret = 0; + + if (dataset[0] == '\0') + return (0); + if (INGLOBALZONE(curproc)) { + if (write != NULL) + *write = 1; + return (1); + } + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) + goto end; + + /* + * Walk the list once, looking for datasets which match exactly, or + * specify a dataset underneath an exported dataset. If found, return + * true and note that it is writable. + */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(zd->zd_dataset); + if (strlen(dataset) >= len && + bcmp(dataset, zd->zd_dataset, len) == 0 && + (dataset[len] == '\0' || dataset[len] == '/' || + dataset[len] == '@')) { + if (write) + *write = 1; + ret = 1; + goto end; + } + } + + /* + * Walk the list a second time, searching for datasets which are parents + * of exported datasets. These should be visible, but read-only. + * + * Note that we also have to support forms such as 'pool/dataset/', with + * a trailing slash. + */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(dataset); + if (dataset[len - 1] == '/') + len--; /* Ignore trailing slash */ + if (len < strlen(zd->zd_dataset) && + bcmp(dataset, zd->zd_dataset, len) == 0 && + zd->zd_dataset[len] == '/') { + if (write) + *write = 0; + ret = 1; + goto end; + } + } +end: + mtx_unlock(&pr->pr_mtx); + return (ret); +} + +static void +zone_destroy(void *arg) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + + head = arg; + while ((zd = LIST_FIRST(head)) != NULL) { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + } + free(head, M_ZONES); +} + +uint32_t +zone_get_hostid(void *ptr) +{ + + KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); + + return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); +} + +boolean_t +in_globalzone(struct proc *p) +{ + return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred)); +} + +static void +zone_sysinit(void *arg __unused) +{ + + zone_slot = osd_jail_register(zone_destroy, NULL); +} + +static void +zone_sysuninit(void *arg __unused) +{ + + osd_jail_deregister(zone_slot); +} + +SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); +SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); diff --git a/module/os/freebsd/zfs/abd.c b/module/os/freebsd/zfs/abd.c new file mode 100644 index 000000000..888a113a4 --- /dev/null +++ b/module/os/freebsd/zfs/abd.c @@ -0,0 +1,1134 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +/* + * It is possible to make all future ABDs be linear by setting this to B_FALSE. + * Otherwise, ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear(). + */ +boolean_t zfs_abd_scatter_enabled = B_TRUE; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +#if defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, + &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +extern inline boolean_t abd_is_linear(abd_t *abd); +extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); +extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); +extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_zero(abd_t *abd, size_t size); + +static void * +abd_alloc_chunk() +{ + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + return (c); +} + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +void +abd_init(void) +{ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(size_t chunkcnt) +{ + size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, size); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + size_t n = abd_chunkcnt_for_bytes(size); + abd_t *abd = abd_alloc_struct(n); + + abd->abd_flags = ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + void *c = abd_alloc_chunk(); + ASSERT3P(c, !=, NULL); + abd->abd_u.abd_scatter.abd_chunks[i] = c; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); + } + + zfs_refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + zfs_refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + if (abd == NULL) + return; + + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. + */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +/* ARGSUSED */ +static inline abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_struct(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd->abd_u.abd_scatter.abd_offset = + new_offset % zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, + &sabd->abd_u.abd_scatter.abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + } + + if (size == 0) + abd->abd_size = sabd->abd_size - off; + else + abd->abd_size = size; + abd->abd_parent = sabd; + zfs_refcount_create(&abd->abd_children); + (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + + return (abd_get_offset_impl(sabd, off, 0)); +} + +abd_t * +abd_get_offset_size(abd_t *sabd, size_t off, size_t size) +{ + ASSERT3U(off + size, <=, sabd->abd_size); + + return (abd_get_offset_impl(sabd, off, size)); +} + + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + if (abd == NULL) + return; + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). + */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +struct abd_iter { + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; /* position (relative to abd_offset) */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ +}; + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + while (size > 0) { + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. + */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd); + abd_iter_init(&saiter, sabd); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i]); + + if (dabd) + abd_iter_init(&daiter, dabd); + + ASSERT3S(dsize, >=, 0); + + critical_enter(); + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + case 2: + len = MIN(caiters[1].iter_mapsize, len); + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } + critical_exit(); +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i]); + abd_iter_init(&xiters[i], tabds[i]); + } + + critical_enter(); + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } + critical_exit(); +} diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c new file mode 100644 index 000000000..f0c272447 --- /dev/null +++ b/module/os/freebsd/zfs/arc_os.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/spa_impl.h> +#include <sys/zio_compress.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#include <sys/vdev.h> +#include <sys/vdev_trim.h> +#include <sys/vdev_impl.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/multilist.h> +#include <sys/abd.h> +#include <sys/zil.h> +#include <sys/fm/fs/zfs.h> +#include <sys/eventhandler.h> +#include <sys/callb.h> +#include <sys/kstat.h> +#include <sys/zthr.h> +#include <zfs_fletcher.h> +#include <sys/arc_impl.h> +#include <sys/sdt.h> +#include <sys/aggsum.h> +#include <cityhash.h> + +extern struct vfsops zfs_vfsops; + +/* vmem_size typemask */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 +#define VMEM_MAXFREE 0x10 +typedef size_t vmem_size_t; +extern vmem_size_t vmem_size(vmem_t *vm, int typemask); + +uint_t zfs_arc_free_target = 0; + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + int64_t n __unused; + free_memory_reason_t r = FMR_UNKNOWN; + +#ifdef _KERNEL + /* + * Cooperate with pagedaemon when it's time for it to scan + * and reclaim some pages. + */ + n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } +#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + n = uma_avail() - (long)(uma_limit() / 4); + if (n < lowest) { + lowest = n; + r = FMR_HEAP_ARENA; + } +#endif + + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this arena remains + * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. + * + * Note that reducing the arc_zio_arena_free_shift keeps more virtual + * memory (in the zio_arena) free, which can avoid memory + * fragmentation issues. + */ + if (zio_arena != NULL) { + n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - + (vmem_size(zio_arena, VMEM_ALLOC) >> + arc_zio_arena_free_shift); + if (n < lowest) { + lowest = n; + r = FMR_ZIO_ARENA; + } + } + +#else /* _KERNEL */ + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; +#endif /* _KERNEL */ + + last_free_memory = lowest; + last_free_reason = r; + DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); + return (lowest); +} + +/* + * Return a default max arc size based on the amount of physical memory. + */ +uint64_t +arc_default_max(uint64_t min, uint64_t allmem) +{ + uint64_t size; + + if (allmem >= 1 << 30) + size = allmem - (1 << 30); + else + size = min; + return (MAX(allmem * 5 / 8, size)); +} + +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *arg) +{ + int64_t nr_scan = *(int64_t *)arg; + + arc_reduce_target_size(ptob(nr_scan)); + free(arg, M_TEMP); + vnlru_free(nr_scan, &zfs_vfsops); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffered they reference. This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +void +arc_prune_async(int64_t adjust) +{ + + int64_t *adjustptr; + + if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL) + return; + + *adjustptr = adjust; + taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP); + ARCSTAT_BUMP(arcstat_prune); +} + +uint64_t +arc_all_memory(void) +{ + return ((uint64_t)ptob(physmem)); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + return (0); +} + +uint64_t +arc_free_memory(void) +{ + /* XXX */ + return (0); +} + +static eventhandler_tag arc_event_lowmem = NULL; + +static void +arc_lowmem(void *arg __unused, int howto __unused) +{ + int64_t free_memory, to_free; + + arc_no_grow = B_TRUE; + arc_warm = B_TRUE; + arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + free_memory = arc_available_memory(); + to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); + DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); + arc_reduce_target_size(to_free); + + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = B_TRUE; + zthr_wakeup(arc_adjust_zthr); + + /* + * It is unsafe to block here in arbitrary threads, because we can come + * here from ARC itself and may hold ARC locks and thus risk a deadlock + * with ARC reclaim thread. + */ + if (curproc == pageproc) + (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); + mutex_exit(&arc_adjust_lock); +} + +void +arc_lowmem_init(void) +{ + arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, + EVENTHANDLER_PRI_FIRST); + +} + +void +arc_lowmem_fini(void) +{ + if (arc_event_lowmem != NULL) + EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); +} diff --git a/module/os/freebsd/zfs/crypto_os.c b/module/os/freebsd/zfs/crypto_os.c new file mode 100644 index 000000000..cc86074c2 --- /dev/null +++ b/module/os/freebsd/zfs/crypto_os.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2005-2010 Pawel Jakub Dawidek <[email protected]> + * Copyright (c) 2018 Sean Eric Fagan <[email protected]> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> + +#ifdef _KERNEL +#include <sys/libkern.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <opencrypto/cryptodev.h> +#include <opencrypto/xform.h> +#else +#include <strings.h> +#endif + +#include <sys/zio_crypt.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> + +#include <sys/freebsd_crypto.h> + +#define SHA512_HMAC_BLOCK_SIZE 128 + +static int crypt_sessions = 0; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD, + &crypt_sessions, 0, "Number of cryptographic sessions created"); + +void +crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key) +{ + uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE], + k_opad[SHA512_HMAC_BLOCK_SIZE], + key[SHA512_HMAC_BLOCK_SIZE]; + SHA512_CTX lctx; + int i; + size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length); + + /* + * This code is based on the similar code in geom/eli/g_eli_hmac.c + */ + explicit_bzero(key, sizeof (key)); + if (c_key->ck_length == 0) + /* do nothing */; + else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE) + bcopy(c_key->ck_data, key, cl_bytes); + else { + /* + * If key is longer than 128 bytes reset it to + * key = SHA512(key). + */ + SHA512_Init(&lctx); + SHA512_Update(&lctx, c_key->ck_data, cl_bytes); + SHA512_Final(key, &lctx); + } + + /* XOR key with ipad and opad values. */ + for (i = 0; i < sizeof (key); i++) { + k_ipad[i] = key[i] ^ 0x36; + k_opad[i] = key[i] ^ 0x5c; + } + explicit_bzero(key, sizeof (key)); + + /* Start inner SHA512. */ + SHA512_Init(&ctx->innerctx); + SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad)); + explicit_bzero(k_ipad, sizeof (k_ipad)); + /* Start outer SHA512. */ + SHA512_Init(&ctx->outerctx); + SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad)); + explicit_bzero(k_opad, sizeof (k_opad)); +} + +void +crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize) +{ + SHA512_Update(&ctx->innerctx, data, datasize); +} + +void +crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize) +{ + uint8_t digest[SHA512_DIGEST_LENGTH]; + + /* Complete inner hash */ + SHA512_Final(digest, &ctx->innerctx); + + /* Complete outer hash */ + SHA512_Update(&ctx->outerctx, digest, sizeof (digest)); + SHA512_Final(digest, &ctx->outerctx); + + explicit_bzero(ctx, sizeof (*ctx)); + /* mdsize == 0 means "Give me the whole hash!" */ + if (mdsize == 0) + mdsize = SHA512_DIGEST_LENGTH; + bcopy(digest, md, mdsize); + explicit_bzero(digest, sizeof (digest)); +} + +void +crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size, + void *out_data, size_t out_data_size) +{ + struct hmac_ctx ctx; + + crypto_mac_init(&ctx, key); + crypto_mac_update(&ctx, in_data, in_data_size); + crypto_mac_final(&ctx, out_data, out_data_size); +} + +static int +freebsd_zfs_crypt_done(struct cryptop *crp) +{ + freebsd_crypt_session_t *ses; + + ses = crp->crp_opaque; + mtx_lock(&ses->fs_lock); + ses->fs_done = true; + mtx_unlock(&ses->fs_lock); + wakeup(crp); + return (0); +} + +void +freebsd_crypt_freesession(freebsd_crypt_session_t *sess) +{ + mtx_destroy(&sess->fs_lock); + crypto_freesession(sess->fs_sid); + explicit_bzero(sess, sizeof (*sess)); +} + +static int +zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp) +{ + int error; + + crp->crp_opaque = session; + crp->crp_callback = freebsd_zfs_crypt_done; + for (;;) { + error = crypto_dispatch(crp); + if (error) + break; + mtx_lock(&session->fs_lock); + while (session->fs_done == false) + msleep(crp, &session->fs_lock, PRIBIO, + "zfs_crypto", hz/5); + mtx_unlock(&session->fs_lock); + + if (crp->crp_etype != EAGAIN) { + error = crp->crp_etype; + break; + } + crp->crp_etype = 0; + crp->crp_flags &= ~CRYPTO_F_DONE; + session->fs_done = false; +#if __FreeBSD_version < 1300087 + /* + * Session ID changed, so we should record that, + * and try again + */ + session->fs_sid = crp->crp_session; +#endif + } + return (error); +} +static void +freebsd_crypt_uio_debug_log(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ +#ifdef FCRYPTO_DEBUG + struct cryptodesc *crd; + uint8_t *p = NULL; + size_t total = 0; + + printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, " + "%p, %u, %u)\n", + __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + data_uio, key->ck_format, key->ck_data, + (unsigned int)key->ck_length, + ivbuf, (unsigned int)datalen, (unsigned int)auth_len); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); + for (int i = 0; i < data_uio->uio_iovcnt; i++) { + printf("\tiovec #%d: <%p, %u>\n", i, + data_uio->uio_iov[i].iov_base, + (unsigned int)data_uio->uio_iov[i].iov_len); + total += data_uio->uio_iov[i].iov_len; + } + data_uio->uio_resid = total; +#endif +} +/* + * Create a new cryptographic session. This should + * happen every time the key changes (including when + * it's first loaded). + */ +#if __FreeBSD_version >= 1300087 +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct crypto_session_params csp; + int error = 0; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + bzero(&csp, sizeof (csp)); + csp.csp_mode = CSP_MODE_AEAD; + csp.csp_cipher_key = key->ck_data; + csp.csp_cipher_klen = key->ck_length / 8; + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16; + csp.csp_ivlen = AES_GCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + case AES_192_GMAC_KEY_LEN: + case AES_256_GMAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + csp.csp_cipher_alg = CRYPTO_AES_CCM_16; + csp.csp_ivlen = AES_CCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + case AES_192_CBC_MAC_KEY_LEN: + case AES_256_CBC_MAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + error = crypto_newsession(&sessp->fs_sid, &csp, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} + +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct cryptop *crp; + freebsd_crypt_session_t *session = NULL; + int error = 0; + size_t total = 0; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + for (int i = 0; i < data_uio->uio_iovcnt; i++) + total += data_uio->uio_iov[i].iov_len; + data_uio->uio_resid = total; + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(session->fs_sid, M_WAITOK); + if (encrypt) { + crp->crp_op = CRYPTO_OP_ENCRYPT | + CRYPTO_OP_COMPUTE_DIGEST; + } else { + crp->crp_op = CRYPTO_OP_DECRYPT | + CRYPTO_OP_VERIFY_DIGEST; + } + crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE; + crp->crp_buf_type = CRYPTO_BUF_UIO; + crp->crp_uio = (void*)data_uio; + crp->crp_ilen = data_uio->uio_resid; + + crp->crp_aad_start = 0; + crp->crp_aad_length = auth_len; + crp->crp_payload_start = auth_len; + crp->crp_payload_length = datalen; + crp->crp_digest_start = auth_len + datalen; + + bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN); + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } + return (error); +} + +#else +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct cryptoini cria, crie, *crip; + struct enc_xform *xform; + struct auth_hash *xauth; + int error = 0; + crypto_session_t sid; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + bzero(&crie, sizeof (crie)); + bzero(&cria, sizeof (cria)); + + crie.cri_alg = xform->type; + crie.cri_key = key->ck_data; + crie.cri_klen = key->ck_length; + + cria.cri_alg = xauth->type; + cria.cri_key = key->ck_data; + cria.cri_klen = key->ck_length; + + cria.cri_next = &crie; + crie.cri_next = NULL; + crip = &cria; + // Everything else is bzero'd + + error = crypto_newsession(&sid, crip, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + if (error != 0) { + printf("%s(%d): crypto_newsession failed with %d\n", + __FUNCTION__, __LINE__, error); + goto bad; + } + sessp->fs_sid = sid; + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: + return (error); +} + +/* + * The meat of encryption/decryption. + * If sessp is NULL, then it will create a + * temporary cryptographic session, and release + * it when done. + */ +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct cryptop *crp; + struct cryptodesc *enc_desc, *auth_desc; + struct enc_xform *xform; + struct auth_hash *xauth; + freebsd_crypt_session_t *session = NULL; + int error; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(2); + if (crp == NULL) { + error = ENOMEM; + goto bad; + } + + auth_desc = crp->crp_desc; + enc_desc = auth_desc->crd_next; + + crp->crp_session = session->fs_sid; + crp->crp_ilen = auth_len + datalen; + crp->crp_buf = (void*)data_uio; + crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC; + + auth_desc->crd_skip = 0; + auth_desc->crd_len = auth_len; + auth_desc->crd_inject = auth_len + datalen; + auth_desc->crd_alg = xauth->type; +#ifdef FCRYPTO_DEBUG + printf("%s: auth: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len, + auth_desc->crd_inject); +#endif + + enc_desc->crd_skip = auth_len; + enc_desc->crd_len = datalen; + enc_desc->crd_inject = auth_len; + enc_desc->crd_alg = xform->type; + enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; + bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN); + enc_desc->crd_next = NULL; + +#ifdef FCRYPTO_DEBUG + printf("%s: enc: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len, + enc_desc->crd_inject); +#endif + + if (encrypt) + enc_desc->crd_flags |= CRD_F_ENCRYPT; + + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} +#endif diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c new file mode 100644 index 000000000..268c843e5 --- /dev/null +++ b/module/os/freebsd/zfs/dmu_os.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/sa.h> +#include <sys/zfeature.h> +#include <sys/abd.h> +#include <sys/zfs_rlock.h> +#include <sys/racct.h> +#include <sys/vm.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_vnops.h> + + +#ifndef IDX_TO_OFF +#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) +#endif + +#if __FreeBSD_version < 1300051 +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY +#else +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY +#endif + + +#if __FreeBSD_version < 1300072 +#define dmu_page_lock(m) vm_page_lock(m) +#define dmu_page_unlock(m) vm_page_unlock(m) +#else +#define dmu_page_lock(m) +#define dmu_page_unlock(m) +#endif + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, + db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size) +{ + struct sf_buf *sf; + vm_object_t vmobj; + vm_page_t m; + dmu_buf_t **dbp; + dmu_buf_t *db; + caddr_t va; + int numbufs, i; + int bufoff, pgoff, tocpy; + int mi, di; + int err; + + ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); + ASSERT(last_size <= PAGE_SIZE); + + err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + if (err != 0) + return (err); + +#ifdef DEBUG + IMPLY(last_size < PAGE_SIZE, *rahead == 0); + if (dbp[0]->db_offset != 0 || numbufs > 1) { + for (i = 0; i < numbufs; i++) { + ASSERT(ISP2(dbp[i]->db_size)); + ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); + ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); + } + } +#endif + + vmobj = ma[0]->object; + zfs_vmobject_wlock(vmobj); + + db = dbp[0]; + for (i = 0; i < *rbehind; i++) { + m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, PAGESIZE); + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rbehind = i; + + bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; + pgoff = 0; + for (mi = 0, di = 0; mi < count && di < numbufs; ) { + if (pgoff == 0) { + m = ma[mi]; + if (m != bogus_page) { + vm_page_assert_xbusied(m); + ASSERT(vm_page_none_valid(m)); + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + va = zfs_map_page(m, &sf); + } + } + if (bufoff == 0) + db = dbp[di]; + + if (m != bogus_page) { + ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, + db->db_offset + bufoff); + } + + /* + * We do not need to clamp the copy size by the file + * size as the last block is zero-filled beyond the + * end of file anyway. + */ + tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); + if (m != bogus_page) + bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); + + pgoff += tocpy; + ASSERT(pgoff <= PAGESIZE); + if (pgoff == PAGESIZE) { + if (m != bogus_page) { + zfs_unmap_page(sf); + vm_page_valid(m); + } + ASSERT(mi < count); + mi++; + pgoff = 0; + } + + bufoff += tocpy; + ASSERT(bufoff <= db->db_size); + if (bufoff == db->db_size) { + ASSERT(di < numbufs); + di++; + bufoff = 0; + } + } + +#ifdef DEBUG + /* + * Three possibilities: + * - last requested page ends at a buffer boundary and , thus, + * all pages and buffers have been iterated; + * - all requested pages are filled, but the last buffer + * has not been exhausted; + * the read-ahead is possible only in this case; + * - all buffers have been read, but the last page has not been + * fully filled; + * this is only possible if the file has only a single buffer + * with a size that is not a multiple of the page size. + */ + if (mi == count) { + ASSERT(di >= numbufs - 1); + IMPLY(*rahead != 0, di == numbufs - 1); + IMPLY(*rahead != 0, bufoff != 0); + ASSERT(pgoff == 0); + } + if (di == numbufs) { + ASSERT(mi >= count - 1); + ASSERT(*rahead == 0); + IMPLY(pgoff == 0, mi == count); + if (pgoff != 0) { + ASSERT(mi == count - 1); + ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); + } + } +#endif + if (pgoff != 0) { + ASSERT(m != bogus_page); + bzero(va + pgoff, PAGESIZE - pgoff); + zfs_unmap_page(sf); + vm_page_valid(m); + } + + for (i = 0; i < *rahead; i++) { + m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + tocpy = MIN(db->db_size - bufoff, PAGESIZE); + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, tocpy); + if (tocpy < PAGESIZE) { + ASSERT(i == *rahead - 1); + ASSERT((db->db_size & PAGE_MASK) != 0); + bzero(va + tocpy, PAGESIZE - tocpy); + } + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rahead = i; + zfs_vmobject_wunlock(vmobj); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} diff --git a/module/os/freebsd/zfs/hkdf.c b/module/os/freebsd/zfs/hkdf.c new file mode 100644 index 000000000..8324ff231 --- /dev/null +++ b/module/os/freebsd/zfs/hkdf.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/hkdf.h> +#include <sys/freebsd_crypto.h> +#include <sys/hkdf.h> + +static int +hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, + uint_t km_len, uint8_t *out_buf) +{ + crypto_key_t key; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(salt_len); + key.ck_data = salt; + + crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH); + + return (0); +} + +static int +hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, + uint8_t *out_buf, uint_t out_len) +{ + struct hmac_ctx ctx; + crypto_key_t key; + uint_t i, T_len = 0, pos = 0; + uint8_t c; + uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH; + uint8_t T[SHA512_DIGEST_LENGTH]; + + if (N > 255) + return (SET_ERROR(EINVAL)); + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); + key.ck_data = extract_key; + + for (i = 1; i <= N; i++) { + c = i; + + crypto_mac_init(&ctx, &key); + crypto_mac_update(&ctx, T, T_len); + crypto_mac_update(&ctx, info, info_len); + crypto_mac_update(&ctx, &c, 1); + crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH); + bcopy(T, out_buf + pos, + (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); + pos += SHA512_DIGEST_LENGTH; + } + + return (0); +} + +/* + * HKDF is designed to be a relatively fast function for deriving keys from a + * master key + a salt. We use this function to generate new encryption keys + * so as to avoid hitting the cryptographic limits of the underlying + * encryption modes. Note that, for the sake of deriving encryption keys, the + * info parameter is called the "salt" everywhere else in the code. + */ +int +hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len) +{ + int ret; + uint8_t extract_key[SHA512_DIGEST_LENGTH]; + + ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len, + extract_key); + if (ret != 0) + return (ret); + + ret = hkdf_sha512_expand(extract_key, info, info_len, output_key, + out_len); + if (ret != 0) + return (ret); + + return (0); +} diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c new file mode 100644 index 000000000..10807afa3 --- /dev/null +++ b/module/os/freebsd/zfs/kmod_core.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/eventhandler.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/fm/util.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/mount.h> +#include <sys/taskqueue.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_bookmark.h> +#include <sys/dsl_userhold.h> +#include <sys/zfeature.h> +#include <sys/zcp.h> +#include <sys/zio_checksum.h> +#include <sys/vdev_removal.h> +#include <sys/dsl_crypt.h> + +#include <sys/zfs_ioctl_compat.h> +#include <sys/zfs_ioctl_impl.h> + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_comutil.h" + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_DECL(_vfs_zfs_vdev); + + +static int zfs_version_ioctl = ZFS_IOCVER_ZOF; +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, + 0, "ZFS_IOCTL_VERSION"); + +static struct cdev *zfsdev; + +extern void zfs_init(void); +extern void zfs_fini(void); +extern void zfs_ioctl_init(void); + + +static struct root_hold_token *zfs_root_token; + +extern uint_t rrw_tsd_key; +extern uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; +extern zfsdev_state_t *zfsdev_state_list; + +#define ZFS_MIN_KSTACK_PAGES 4 + +static void +zfs_cmd_bsd12_to_zof(zfs_cmd_legacy_t *src, zfs_cmd_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_zoneid = src->zc_jailid; +} + +static void +zfs_cmd_zof_to_bsd12(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_jailid = src->zc_zoneid; +} + +static int +zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, + struct thread *td) +{ + uint_t len, vecnum; + zfs_iocparm_t *zp; + zfs_cmd_t *zc; + zfs_cmd_legacy_t *zcl; + int rc, error; + void *uaddr; + + len = IOCPARM_LEN(zcmd); + vecnum = zcmd & 0xff; + zp = (void *)arg; + uaddr = (void *)zp->zfs_cmd; + error = 0; + zcl = NULL; + + if (len != sizeof (zfs_iocparm_t)) { + printf("len %d vecnum: %d sizeof (zfs_cmd_t) %lu\n", + len, vecnum, sizeof (zfs_cmd_t)); + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + /* + * Remap ioctl code for legacy user binaries + */ + if (zp->zfs_ioctl_version == ZFS_IOCVER_FREEBSD) { + if (vecnum >= sizeof (zfs_ioctl_bsd12_to_zof)/sizeof (long)) { + kmem_free(zc, sizeof (zfs_cmd_t)); + return (ENOTSUP); + } + zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); + vecnum = zfs_ioctl_bsd12_to_zof[vecnum]; + if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + zfs_cmd_bsd12_to_zof(zcl, zc); + } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + error = zfsdev_ioctl_common(vecnum, zc); + if (zcl) { + zfs_cmd_zof_to_bsd12(zc, zcl); + rc = copyout(zcl, uaddr, sizeof (*zcl)); + } else { + rc = copyout(zc, uaddr, sizeof (*zc)); + } + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); +out: + if (zcl) + kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfsdev_state_t *zs, *zsp = data; + + mutex_enter(&zfsdev_state_lock); + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs == zsp) + break; + } + if (zs == NULL || zs->zs_minor <= 0) { + mutex_exit(&zfsdev_state_lock); + return; + } + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + mutex_exit(&zfsdev_state_lock); +} + +static int +zfs_ctldev_init(struct cdev *devp) +{ + boolean_t newzs = B_FALSE; + minor_t minor; + zfsdev_state_t *zs, *zsprev = NULL; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == -1) + break; + zsprev = zs; + } + + if (!zs) { + zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + newzs = B_TRUE; + } + + devfs_set_cdevpriv(zs, zfsdev_close); + + zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); + zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); + + if (newzs) { + zs->zs_minor = minor; + wmb(); + zsprev->zs_next = zs; + } else { + wmb(); + zs->zs_minor = minor; + } + return (0); +} + +static int +zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&zfsdev_state_lock); + + return (error); +} + +static struct cdevsw zfs_cdevsw = { + .d_version = D_VERSION, + .d_open = zfsdev_open, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DRIVER +}; + +int +zfsdev_attach(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, + ZFS_DRIVER); + return (0); +} + +void +zfsdev_detach(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +int +zfs__init(void) +{ + int error; + +#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES + printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " + "overflow panic!\nPlease consider adding " + "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, + ZFS_MIN_KSTACK_PAGES); +#endif + zfs_root_token = root_mount_hold("ZFS"); + if ((error = zfs_kmod_init()) != 0) { + printf("ZFS: Failed to Load ZFS Filesystem" + ", rc = %d\n", error); + root_mount_rel(zfs_root_token); + return (error); + } + + + tsd_create(&zfs_geom_probe_vdev_key, NULL); + + printf("ZFS storage pool version: features support (" + SPA_VERSION_STRING ")\n"); + root_mount_rel(zfs_root_token); + ddi_sysevent_init(); + return (0); +} + +int +zfs__fini(void) +{ + if (zfs_busy() || zvol_busy() || + zio_injection_enabled) { + return (EBUSY); + } + zfs_kmod_fini(); + tsd_destroy(&zfs_geom_probe_vdev_key); + return (0); +} + +static void +zfs_shutdown(void *arg __unused, int howto __unused) +{ + + /* + * ZFS fini routines can not properly work in a panic-ed system. + */ + if (panicstr == NULL) + zfs__fini(); +} + + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; + +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif + +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); +MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); +MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1); diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c new file mode 100644 index 000000000..ed124a5fa --- /dev/null +++ b/module/os/freebsd/zfs/spa_os.c @@ -0,0 +1,280 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska <[email protected]>. All rights reserved. + */ + + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/ddt.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_removal.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/vdev_indirect_births.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/bpobj.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_objset.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/fs/zfs.h> +#include <sys/arc.h> +#include <sys/callb.h> +#include <sys/spa_boot.h> +#include <sys/zfs_ioctl.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_userhold.h> +#include <sys/zfeature.h> +#include <sys/zvol.h> +#include <sys/abd.h> +#include <sys/callb.h> +#include <sys/zone.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + tops, nchildren) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. + */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof (void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof (void *)); + nvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name) +{ + spa_t *spa; + vdev_t *rvd; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * The pool could already be imported, + * e.g., after reboot -r. + */ + if (spa->spa_state == POOL_STATE_ACTIVE) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + return (0); + } + + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("freebsd"); +} diff --git a/module/os/freebsd/zfs/spa_stats.c b/module/os/freebsd/zfs/spa_stats.c new file mode 100644 index 000000000..45c880ada --- /dev/null +++ b/module/os/freebsd/zfs/spa_stats.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/spa.h> +#include <zfs_comutil.h> + +void +spa_stats_init(spa_t *spa) +{ + +} + +void +spa_stats_destroy(spa_t *spa) +{ + +} + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ +} + +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + +} +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + return (0); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + return (NULL); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + +} + +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + +} + +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + return (0); +} + +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + return (0); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c new file mode 100644 index 000000000..ea9c1b3f1 --- /dev/null +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/mount.h> +#include <sys/taskqueue.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_bookmark.h> +#include <sys/dsl_userhold.h> +#include <sys/zfeature.h> +#include <sys/zcp.h> +#include <sys/zio_checksum.h> +#include <sys/vdev_removal.h> +#include <sys/dsl_crypt.h> + +#include <sys/zfs_ioctl_compat.h> +#include <sys/zfs_context.h> + +#include <sys/arc_impl.h> +#include <sys/dsl_pool.h> + +#include <../zfs_config.h> + +/* BEGIN CSTYLED */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS events"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "space allocation"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "reconstruct"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "multihost protection"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "metaslab group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "lua"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "l2arc"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS Adaptive Replacement Cache"); + +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV Mirror"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "livelist state"); +SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "condense knobs"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "receive knobs"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "send knobs"); + +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, + (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); + +extern arc_state_t ARC_anon; +extern arc_state_t ARC_mru; +extern arc_state_t ARC_mru_ghost; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mfu_ghost; +extern arc_state_t ARC_l2c_only; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ + +/* arc.c */ + +/* legacy compat */ +extern unsigned long l2arc_write_max; /* def max write size */ +extern unsigned long l2arc_write_boost; /* extra warmup write */ +extern unsigned long l2arc_headroom; /* # of dev writes */ +extern unsigned long l2arc_headroom_boost; +extern unsigned long l2arc_feed_secs; /* interval seconds */ +extern unsigned long l2arc_feed_min_ms; /* min interval msecs */ +extern int l2arc_noprefetch; /* don't cache prefetch bufs */ +extern int l2arc_feed_again; /* turbo warmup */ +extern int l2arc_norw; /* no reads during writes */ + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write during warmup (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, + &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, + &l2arc_feed_again, 0, "turbo warmup (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, + &l2arc_norw, 0, "no reads during writes (LEGACY)"); +#if 0 +extern int zfs_compressed_arc_enabled; +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW, + &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)"); +#endif + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); + +extern int arc_no_grow_shift; +extern int arc_shrink_shift; + +extern arc_stats_t arc_stats; +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ +#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ + +static int +sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + uint32_t val; + int err; + + val = arc_no_grow_shift; + err = sysctl_handle_32(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, + 0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", + "log2(fraction of ARC which must be free to allow growing)"); +/* dbuf.c */ + + +/* dmu.c */ + +/* dmu_zfetch.c */ +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); + +/* max bytes to prefetch per stream (default 8MB) */ +extern uint32_t zfetch_max_distance; +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); + +/* max bytes to prefetch indirects for per stream (default 64MB) */ +extern uint32_t zfetch_max_idistance; +SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); + +/* dsl_pool.c */ + +/* dnode.c */ +extern int zfs_default_bs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, + &zfs_default_bs, 0, "Default dnode block shift"); + +extern int zfs_default_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); + + +/* dsl_scan.c */ + +/* metaslab.c */ + +/* + * Enable/disable lba weighting (i.e. outer tracks are given preference). + */ +extern boolean_t metaslab_lba_weighting_enabled; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting, CTLFLAG_RWTUN, + &metaslab_lba_weighting_enabled, 0, + "Enable LBA weighting (i.e. outer tracks are given preference)"); + + +/* + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. So a sane default for the space map block size + * is 8~16K. + */ +extern int zfs_metaslab_sm_blksz_no_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_no_log, 0, + "Block size for space map in pools with log space map disabled. " + "Power of 2 and greater than 4096."); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +extern int zfs_metaslab_sm_blksz_with_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_with_log, 0, + "Block size for space map in pools with log space map enabled. " + "Power of 2 and greater than 4096."); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +extern int zfs_condense_pct; +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, + &zfs_condense_pct, 0, + "Condense on-disk spacemap when it is more than this many percents" + " of in-memory counterpart"); + +extern int zfs_remove_max_segment; +SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, + &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to" + " allocate when removing a device"); + +extern int zfs_removal_suspend_progress; +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, + &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while" + " in the middle of a removal"); + + +/* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +extern uint64_t metaslab_df_alloc_threshold; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, + &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change it's allocation strategy"); + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +extern int metaslab_df_free_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, + &metaslab_df_free_pct, 0, + "The minimum free space, in percent, which must be available in a " + "space map to continue allocations in a first-fit fashion"); + +/* + * Percentage of all cpus that can be used by the metaslab taskq. + */ +extern int metaslab_load_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, + &metaslab_load_pct, 0, + "Percentage of cpus that can be used by the metaslab taskq"); + +/* + * Max number of metaslabs per group to preload. + */ +extern int metaslab_preload_limit; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, + &metaslab_preload_limit, 0, + "Max number of metaslabs per group to preload"); + +/* refcount.c */ +extern int reference_tracking_enable; +SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, + &reference_tracking_enable, 0, + "Track reference holders to refcount_t objects, used mostly by ZFS"); + +/* spa.c */ +extern int zfs_ccw_retry_interval; +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, + &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval (seconds)"); + +extern uint64_t zfs_max_missing_tvds_cachefile; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_cachefile, 0, + "allow importing pools with missing top-level vdevs in cache file"); + +extern uint64_t zfs_max_missing_tvds_scan; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_scan, 0, + "allow importing pools with missing top-level vdevs during scan"); + +/* spa_misc.c */ +extern int zfs_flags; +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. + */ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + val &= ~ZFS_DEBUG_MODIFY; + zfs_flags = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, + sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); + +int +param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_synctime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_synctime_ms = val; + + spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_ziotime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_ziotime_ms = val; + + spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) +{ + char buf[16]; + int rc; + + if (req->newptr == NULL) + strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); + + rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (rc || req->newptr == NULL) + return (rc); + if (strcmp(buf, zfs_deadman_failmode) == 0) + return (0); + if (!strcmp(buf, "wait")) + zfs_deadman_failmode = "wait"; + if (!strcmp(buf, "continue")) + zfs_deadman_failmode = "continue"; + if (!strcmp(buf, "panic")) + zfs_deadman_failmode = "panic"; + + return (-param_set_deadman_failmode_common(buf)); +} + + +/* spacemap.c */ +extern int space_map_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + + +/* vdev.c */ +#ifdef notyet +extern uint64_t zfs_max_auto_ashift; +extern uint64_t zfs_min_auto_ashift; + +static int +sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > ASHIFT_MAX || val < zfs_min_auto_ashift) + return (EINVAL); + + zfs_max_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), + sysctl_vfs_zfs_max_auto_ashift, "QU", + "Max ashift used when optimising for logical -> physical sectors size on " + "new top-level vdevs."); +static int +sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < ASHIFT_MIN || val > zfs_max_auto_ashift) + return (EINVAL); + + zfs_min_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), + sysctl_vfs_zfs_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdevs."); +#endif + +/* + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. + */ +extern int zfs_vdev_dtl_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_dtl_sm_blksz, 0, + "Block size for DTL space map. Power of 2 and greater than 4096."); + +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +extern int zfs_vdev_standard_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_standard_sm_blksz, 0, + "Block size for standard space map. Power of 2 and greater than 4096."); + +extern int vdev_validate_skip; +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, + &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + + +/* vdev_cache.c */ + +/* vdev_mirror.c */ +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + + +/* vdev_queue.c */ +#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ +extern uint32_t zfs_vdev_ ## name ## _min_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _min_active, 0, \ + "Initial number of I/O requests of type " #name \ + " active for each device"); + +#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ +extern uint32_t zfs_vdev_ ## name ## _max_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \ + &zfs_vdev_ ## name ## _max_active, 0, \ + "Maximum number of I/O requests of type " #name \ + " active for each device"); + + +#undef ZFS_VDEV_QUEUE_KNOB + +extern uint32_t zfs_vdev_max_active; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device. (LEGACY)"); + +extern int zfs_vdev_def_queue_depth; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, + &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); + +/*extern uint64_t zfs_multihost_history; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN, + &zfs_multihost_history, 0, + "Historical staticists for the last N multihost updates");*/ + +#ifdef notyet +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); +#endif + + +/* zio.c */ +#if defined(__LP64__) +int zio_use_uma = 1; +#else +int zio_use_uma = 0; +#endif + +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, + "Use uma(9) for ZIO allocations"); +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, + "Exclude metadata buffers from dumps as well"); + + +int +param_set_arc_long(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_arc_int(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_int(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_slop_shift(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = *(int *)arg1; + + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 1 || val > 31) + return (EINVAL); + + *(int *)arg1 = val; + + return (0); +} diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c new file mode 100644 index 000000000..01851378e --- /dev/null +++ b/module/os/freebsd/zfs/vdev_file.c @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> +#include <sys/abd.h> +#include <sys/stat.h> + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), + minclsyspri, max_ncpus, INT_MAX, 0); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error; + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + if (!S_ISREG(zfa.zfa_mode)) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (SET_ERROR(ENODEV)); + } +#endif + +skip_open: + + error = zfs_file_getattr(vf->vf_file, &zfa); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_file != NULL) { + zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(zio_t *zio) +{ + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf; + void *buf; + ssize_t resid; + loff_t off; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + vf = vd->vdev_tsd; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_type == ZIO_TYPE_READ) { + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); + abd_return_buf_copy(zio->io_abd, buf, size); + } else { + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + abd_return_buf(zio->io_abd, buf, size); + } + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + vdev_file_io_intr(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { +#ifdef notyet + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); +#endif + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c new file mode 100644 index 000000000..d87bbbc18 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -0,0 +1,1195 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bio.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <geom/geom.h> +#include <geom/geom_disk.h> +#include <geom/geom_int.h> + +/* + * Virtual device vector for GEOM. + */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; + +struct consumer_vdev_elem { + SLIST_ENTRY(consumer_vdev_elem) elems; + vdev_t *vd; +}; + +SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); +/* BEGIN CSTYLED */ +_Static_assert(sizeof (((struct g_consumer *)NULL)->private) + == sizeof (struct consumer_priv_t*), + "consumer_priv_t* can't be stored in g_consumer.private"); + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); +/* END CSTYLED */ + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. + */ +uint_t zfs_geom_probe_vdev_key; + +static void +vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, + boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. + */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} + +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); + return; + } + } +} + +static void +vdev_geom_resize(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + spa_t *spa; + vdev_t *vd; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vd = elem->vd; + if (vd->vdev_state != VDEV_STATE_HEALTHY) + continue; + spa = vd->vdev_spa; + if (!spa->spa_autoexpand) + continue; + vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); + } +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + // cppcheck-suppress uninitvar + struct consumer_vdev_elem *elem; + + g_topology_assert(); + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + /* Vdev close in progress. Ignore the event. */ + return; + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + // cppcheck-suppress All + SLIST_FOREACH(elem, priv, elems) { + // cppcheck-suppress uninitvar + vdev_t *vd = elem->vd; + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + } +} + +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (sanity) { + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + } + + /* Do we have geom already? No? Create one. */ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + gp->resize = vdev_geom_resize; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + + if (vd != NULL) + vd->vdev_tsd = cp; + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} + +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider && cp->provider->name ? cp->provider->name : "NULL"); + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem, *elem_temp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); + priv = (struct consumer_priv_t *)&cp->private; + vd->vdev_tsd = NULL; + SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { + if (elem->vd == vd) { + SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); + g_free(elem); + } + } + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. There may be + * more bios actually issued than entries in the array + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + uint8_t *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); + n_bios = 0; + + /* How many bios are required for all commands ? */ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof (struct bio *); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT((off % cp->provider->sectorsize) == 0); + ASSERT((s % cp->provider->sectorsize) == 0); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = (caddr_t)p; + g_io_request(bios[j], cp); + } + } + ASSERT(j == n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || + errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +/* + * Read the vdev config from a device. Return the number of valid labels that + * were found. The vdev config will be returned in config if and only if at + * least one valid label was found. + */ +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) +{ + struct g_provider *pp; + nvlist_t *config; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, nlabels; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + + size = sizeof (*vdev_lists[0]) + pp->sectorsize - + ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof (vdev_lists[0]->vp_nvlist); + + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT(offsets[l] % pp->sectorsize == 0); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + config = *configp = NULL; + nlabels = 0; + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(config); + continue; + } + + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; + } + + /* Free the label storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (nlabels); +} + +static void +resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof (void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t *known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid; + uint64_t id, txg, known_txg; + char *pname; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} + +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int nlabels; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL, B_TRUE); + if (zcp == NULL) + continue; + g_topology_unlock(); + nlabels = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (nlabels == 0) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 0 : ENOENT); +} + +enum match { + NO_MATCH = 0, /* No matching labels found */ + TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ + ZERO_MATCH = 1, /* Should never be returned */ + ONE_MATCH = 2, /* 1 label matching the vdev_guid */ + TWO_MATCH = 3, /* 2 label matching the vdev_guid */ + THREE_MATCH = 4, /* 3 label matching the vdev_guid */ + FULL_MATCH = 5 /* all labels match the vdev_guid */ +}; + +static enum match +vdev_attach_ok(vdev_t *vd, struct g_provider *pp) +{ + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; + int nlabels; + + cp = vdev_geom_attach(pp, NULL, B_TRUE); + if (cp == NULL) { + ZFS_LOG(1, "Unable to attach tasting instance to %s.", + pp->name); + return (NO_MATCH); + } + g_topology_unlock(); + nlabels = vdev_geom_read_config(cp, &config); + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + if (nlabels == 0) { + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } + + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid. + * Inactive spares and L2ARCs do not have any pool guid in the label. + */ + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); + } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles possible race on vdev detach, when + * remaining vdev receives GUID of destroyed top level mirror vdev. + */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (ZERO_MATCH + nlabels); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOPGUID_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} + +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp, *best_pp; + struct g_consumer *cp; + const char *vdpath; + enum match match, best_match; + + g_topology_assert(); + + vdpath = vd->vdev_path + sizeof ("/dev/") - 1; + cp = NULL; + best_pp = NULL; + best_match = NO_MATCH; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + match = vdev_attach_ok(vd, pp); + if (match > best_match) { + best_match = match; + best_pp = pp; + } else if (match == best_match) { + if (strcmp(pp->name, vdpath) == 0) { + best_pp = pp; + } + } + if (match == FULL_MATCH) + goto out; + } + } + } + +out: + if (best_pp) { + cp = vdev_geom_attach(best_pp, vd, B_TRUE); + if (cp == NULL) { + printf("ZFS WARNING: Unable to attach to %s.\n", + best_pp->name); + } + } + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, cp->provider->name); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd, B_FALSE); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + int error, has_trim; + uint16_t rate; + + /* + * Set the TLS to indicate downstack that we + * should not access zvols + */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_is_splitting || + ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user knows what he/she is doing and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. + */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); + + if (cp == NULL) { + ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); + error = ENOENT; + } else { + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int spamode; + + priv = (struct consumer_priv_t *)&cp->private; + if (cp->private == NULL) + SLIST_INIT(priv); + elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); + elem->vd = vd; + SLIST_INSERT_HEAD(priv, elem, elems); + + spamode = spa_mode(vd->vdev_spa); + if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + cp->provider->name); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for " + "writing (error=%d).\n", + cp->provider->name, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + } + + /* Fetch initial physical path information for this device. */ + if (cp != NULL) { + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + /* Set other GEOM characteristics */ + vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); + } + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", + error); + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. + */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; +#ifdef notyet + if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && + pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; +#endif + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational. */ + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0 && rate == DISK_RR_NON_ROTATING) + vd->vdev_nonrot = B_TRUE; + else + vd->vdev_nonrot = B_FALSE; + + /* Set when device reports it supports TRIM. */ + error = g_getattr("GEOM::candelete", cp, &has_trim); + vd->vdev_has_trim = (error == 0 && has_trim); + + /* Set when device reports it supports secure TRIM. */ + /* unavailable on FreeBSD */ + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + struct g_consumer *cp; + + cp = vd->vdev_tsd; + + DROP_GIANT(); + g_topology_lock(); + + if (!vd->vdev_reopening || + (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || + (cp->provider != NULL && cp->provider->error != 0)))) + vdev_geom_close_locked(vd); + + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch (zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. + */ + switch (bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + + /* + * We have to split bio freeing into two parts, because the ABD code + * cannot be called in this context and vdev_op_io_done is not called + * for ZIO_TYPE_IOCTL zio-s. + */ + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + g_destroy_bio(bp); + zio->io_bio = NULL; + } + zio_delay_interrupt(zio); +} + +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + struct g_consumer *cp; + struct bio *bp; + + vd = zio->io_vd; + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || + vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + } + + zio_execute(zio); + return; + case ZIO_TYPE_TRIM: + if (!vdev_geom_bio_delete_disable) { + goto sendreq; + } + zio_execute(zio); + return; + default: + ; + /* PASSTHROUGH --- placate compiler */ + } +sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM || + zio->io_type == ZIO_TYPE_IOCTL); + + cp = vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + if (zio->io_type == ZIO_TYPE_READ) { + bp->bio_cmd = BIO_READ; + bp->bio_data = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->bio_cmd = BIO_WRITE; + bp->bio_data = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + break; + case ZIO_TYPE_TRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + default: + panic("invalid zio->io_type: %d\n", zio->io_type); + } + bp->bio_done = vdev_geom_io_intr; + zio->io_bio = bp; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ + struct bio *bp = zio->io_bio; + + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + ASSERT(bp == NULL); + return; + } + + if (bp == NULL) { + ASSERT3S(zio->io_error, ==, ENXIO); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); + else + abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); + + g_destroy_bio(bp); + zio->io_bio = NULL; +} + +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + +vdev_ops_t vdev_disk_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + NULL, + vdev_geom_hold, + vdev_geom_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c new file mode 100644 index 000000000..e734a2af8 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/vdev.h> +#include <sys/vdev_os.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/zio.h> +#include <sys/dsl_scan.h> +#include <sys/abd.h> +#include <sys/fs/zfs.h> + +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + spa_t *spa = vd->vdev_spa; + zio_t *zio; + abd_t *pad2; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + + if (size > VDEV_PAD_SIZE) + return (EINVAL); + + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); + abd_copy_from_buf(pad2, buf, size); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + vdev_label_write(zio, vd, 0, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(pad2); + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c new file mode 100644 index 000000000..c11e16437 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -0,0 +1,2738 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_quota.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <acl/acl_common.h> + + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? + ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. + */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. + */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize; + int acl_count; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". + */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); + boolean_t isreg = (vtype == VREG); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. + */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + if ((flag & IS_ROOT_NODE) == 0) { + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + } else + ASSERT(dzp->z_vnode == NULL); + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. + */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD_kernel__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else + gid = crgetgid(cr); +#endif + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). + */ + if ((v4_mode & WRITE_MASK_DATA) && + (zp->z_pflags & ZFS_IMMUTABLE)) { + return (SET_ERROR(EPERM)); + } + + /* + * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). + */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCESS if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != UID_NOBODY && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + (ZTOV(zp)->v_type != VDIR) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +/* + * Check if VEXEC is allowed. + * + * This routine is based on zfs_fastaccesschk_execute which has slowpath + * calling zfs_zaccess. This would be incorrect on FreeBSD (see + * zfs_freebsd_access for the difference). Thus this variant let's the + * caller handle the slowpath (if necessary). + * + * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED. + * + * Safe access to znode_t is provided by the vnode lock. + */ +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (1); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + return (1); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) + return (0); + + mutex_enter(&zdp->z_acl_lock); + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { + goto out_slow; + } + + if (uid == zdp->z_uid) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + goto out; + } else { + goto out_slow; + } + } + if (groupmember(zdp->z_gid, cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + goto out; + } else { + goto out_slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + goto out; + } + } +out: + mutex_exit(&zdp->z_acl_lock); + return (0); +out_slow: + mutex_exit(&zdp->z_acl_lock); + return (1); +} + + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); + +#ifdef __FreeBSD_kernel__ + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. + */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + if ((error = zfs_zget(zp->z_zfsvfs, + parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + vnode_t *check_vp = ZTOV(check_zp); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, check_vp, owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(check_vp, cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(check_vp, cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(check_vp, cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(check_vp, cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t available_perms, cred_t *cr) +{ + int error; + uid_t downer; + + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + mode_t available_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; + + /* + * We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. + */ + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * First row + * If the directory permissions allow the delete, we are done. + */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) { + /* XXXPJD: s/dzp/zp/ ? */ + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); + } + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + + if (dzp_error != 0 && !dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + */ + + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; + + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c new file mode 100644 index 000000000..ed6652c3b --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -0,0 +1,1345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. + */ + +#include <sys/dirent.h> +#include <sys/zfs_context.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_deleg.h> +#include <sys/mount.h> +#include <sys/zap.h> +#include <sys/sysproto.h> + +#include "zfs_namecheck.h" + +#include <sys/kernel.h> + +/* Common access mode for all virtual directories under the ctldir */ +const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + +/* + * "Synthetic" filesystem implementation. + */ + +/* + * Assert that A implies B. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t sn_parent_id; + uint64_t sn_id; +} sfs_node_t; + +/* + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artifical + * IDs, znode IDs) may clash. + */ +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality. */ + return (!equal); +} + +static int +sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + sfs_node_t search; + int err; + + search.sn_id = id; + search.sn_parent_id = parent_id; + err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, &search); + return (err); +} + +static int +sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + int err; + + KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); + err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, vp->v_data); + return (err); +} + +static void +sfs_vnode_remove(struct vnode *vp) +{ + vfs_hash_remove(vp); +} + +typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); + +static int +sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, + const char *tag, struct vop_vector *vops, + sfs_vnode_setup_fn setup, void *arg, + struct vnode **vpp) +{ + struct vnode *vp; + int error; + + error = sfs_vnode_get(mp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + /* Allocate a new vnode/inode. */ + error = getnewvnode(tag, mp, vops, &vp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + /* + * Exclusively lock the vnode vnode while it's being constructed. + */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + error = insmntque(vp, mp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + setup(vp, arg); + + error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + *vpp = vp; + return (0); +} + +static void +sfs_print_node(sfs_node_t *node) +{ + printf("\tname = %s\n", node->sn_name); + printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); + printf("\tid = %ju\n", (uintmax_t)node->sn_id); +} + +static sfs_node_t * +sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) +{ + struct sfs_node *node; + + KASSERT(strlen(name) < sizeof (node->sn_name), + ("sfs node name is too long")); + KASSERT(size >= sizeof (*node), ("sfs node size is too small")); + node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); + strlcpy(node->sn_name, name, sizeof (node->sn_name)); + node->sn_parent_id = parent_id; + node->sn_id = id; + + return (node); +} + +static void +sfs_destroy_node(sfs_node_t *node) +{ + free(node, M_SFSNODES); +} + +static void * +sfs_reclaim_vnode(vnode_t *vp) +{ + void *data; + + sfs_vnode_remove(vp); + data = vp->v_data; + vp->v_data = NULL; + return (data); +} + +static int +sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, + uio_t *uio, off_t *offp) +{ + struct dirent entry; + int error; + + /* Reset ncookies for subsequent use of vfs_read_dirent. */ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (uio->uio_resid < sizeof (entry)) + return (SET_ERROR(EINVAL)); + + if (uio->uio_offset < 0) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '\0'; + entry.d_namlen = 1; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (uio->uio_offset < sizeof (entry)) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == sizeof (entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_name[2] = '\0'; + entry.d_namlen = 2; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof (entry); + return (0); +} + + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares_dir; + +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot) || + vn_matchops(vp, zfsctl_ops_shares_dir)); + +} + +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; + + +/* + * Create the '.zfs' directory. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + zfsctl_root_t *dot_zfs; + sfs_node_t *snapdir; + vnode_t *rvp; + uint64_t crtime[2]; + + ASSERT(zfsvfs->z_ctldir == NULL); + + snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT, + ZFSCTL_INO_SNAPDIR); + dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0, + ZFSCTL_INO_ROOT); + dot_zfs->snapdir = snapdir; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); + vput(rvp); + + zfsvfs->z_ctldir = dot_zfs; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * The nodes must not have any associated vnodes by now as they should be + * vflush-ed. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + sfs_destroy_node(zfsvfs->z_ctldir->snapdir); + sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +static int +zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + return (VFS_ROOT(mp, flags, vpp)); +} + +static void +zfsctl_common_vnode_setup(vnode_t *vp, void *arg) +{ + ASSERT_VOP_ELOCKED(vp, __func__); + + /* We support shared locking. */ + VN_LOCK_ASHARE(vp); + vp->v_type = VDIR; + vp->v_data = arg; +} + +static int +zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir; + err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, + zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +static int +zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir; + err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", + &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +int +zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) +{ + int error; + + error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); + return (error); +} + +/* + * Common open routine. Disallow any write access. + */ +static int +zfsctl_common_open(struct vop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (SET_ERROR(EACCES)); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vop_close_args *ap) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +static int +zfsctl_common_access(struct vop_access_args *ap) +{ + accmode_t accmode = ap->a_accmode; + + if (accmode & VWRITE) + return (SET_ERROR(EACCES)); + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + timestruc_t now; + sfs_node_t *node; + + node = vp->v_data; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purely virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vn_fsid(vp, vap); + vap->va_mode = zfsctl_ctldir_mode; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; + + vap->va_nodeid = node->sn_id; + + /* At least '.' and '..'. */ + vap->va_nlink = 2; +} + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +struct vop_fid_args { + struct vnode *a_vp; + struct fid *a_fid; +}; +#endif + +static int +zfsctl_common_fid(struct vop_fid_args *ap) +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + sfs_node_t *node = vp->v_data; + uint64_t object = node->sn_id; + zfid_short_t *zfid; + int i; + + zfid = (zfid_short_t *)fidp; + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs nodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfsctl_common_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + + (void) sfs_reclaim_vnode(vp); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_print_args { + struct vnode *a_vp; +}; +#endif + +static int +zfsctl_common_print(struct vop_print_args *ap) +{ + sfs_print_node(ap->a_vp->v_data); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +/* + * Get root directory attributes. + */ +static int +zfsctl_root_getattr(struct vop_getattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; + + zfsctl_common_getattr(vp, vap); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} + +/* + * When we lookup "." we still can be asked to lock it + * differently, can't we? + */ +int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relock for the "." case may left us with reclaimed vnode. */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); +} + +/* + * Special case the handling of "..". + */ +int +zfsctl_root_lookup(struct vop_lookup_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); + } else { + err = SET_ERROR(ENOENT); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_root_readdir(struct vop_readdir_args *ap) +{ + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_root_t *node = vp->v_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + if (uio->uio_offset != dots_offset) + return (SET_ERROR(EINVAL)); + + CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; + return (0); +} + +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) +{ + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; + + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); + + error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); + + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); +} + +static int +zfsctl_common_pathconf(struct vop_pathconf_args *ap) +{ + /* + * We care about ACL variables so that user land utilities like ls + * can display them correctly. Since the ctldir's st_dev is set to be + * the same as the parent dataset, we must support all variables that + * it supports. + */ + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + return (0); + + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = (int)SPA_MINBLOCKSIZE; + return (0); + + case _PC_ACL_EXTENDED: + *ap->a_retval = 0; + return (0); + + case _PC_ACL_NFS4: + *ap->a_retval = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *ap->a_retval = ACL_MAX_ENTRIES; + return (0); + + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * Returns a trivial ACL + */ +int +zfsctl_common_getacl(struct vop_getacl_args *ap) +{ + int i; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); + /* + * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify + * attributes. That is not the case for the ctldir, so we must clear + * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs + * aren't supported by the ctldir. + */ + for (i = 0; i < ap->a_aclp->acl_cnt; i++) { + struct acl_entry *entry; + entry = &(ap->a_aclp->acl_entry[i]); + entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | + ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | + ACL_READ_NAMED_ATTRS); + } + + return (0); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + int err; + + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); + return (err); +} + +/* + * Given a vnode get a root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the orinal vnode remains referenced + * and locked. If any error happens the orinal vnode is unlocked and + * released. + */ +static int +zfsctl_mounted_here(vnode_t **vpp, int flags) +{ + struct mount *mp; + int err; + + ASSERT_VOP_LOCKED(*vpp, __func__); + ASSERT3S((*vpp)->v_type, ==, VDIR); + + if ((mp = (*vpp)->v_mountedhere) != NULL) { + err = vfs_busy(mp, 0); + KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); + KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); + vput(*vpp); + err = VFS_ROOT(mp, flags, vpp); + vfs_unbusy(mp); + return (err); + } + return (EJUSTRETURN); +} + +typedef struct { + const char *snap_name; + uint64_t snap_id; +} snapshot_setup_arg_t; + +static void +zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) +{ + snapshot_setup_arg_t *ssa = arg; + sfs_node_t *node; + + ASSERT_VOP_ELOCKED(vp, __func__); + + node = sfs_alloc_node(sizeof (sfs_node_t), + ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); + zfsctl_common_vnode_setup(vp, node); + + /* We have to support recursive locking. */ + VN_LOCK_AREC(vp); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + * There are four possibilities: + * - the snapshot node and vnode do not exist + * - the snapshot vnode is covered by the mounted snapshot + * - the snapshot vnode is not covered yet, the mount operation is in progress + * - the snapshot vnode is not covered, because the snapshot has been unmounted + * The last two states are transient and should be relatively short-lived. + */ +int +zfsctl_snapdir_lookup(struct vop_lookup_args *ap) +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + size_t mountpoint_len; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); + } + + if (cnp->cn_namelen >= sizeof (name)) + return (SET_ERROR(ENAMETOOLONG)); + + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) + return (err); + + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); + + /* + * If the vnode is not covered, then either the mount operation + * is in progress or the snapshot has already been unmounted + * but the vnode hasn't been inactivated and reclaimed yet. + * We can try to re-use the vnode in the latter case. + */ + VI_LOCK(*vpp); + if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + /* + * Upgrade to exclusive lock in order to: + * - avoid race conditions + * - satisfy the contract of mount_snapshot() + */ + err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + if (err == 0) + break; + } else { + VI_UNLOCK(*vpp); + } + + /* + * In this state we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. + * So, yield to prevent that from happening. + */ + vput(*vpp); + kern_yield(PRI_USER); + } + + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/<snapname> accessible over NFS + * without requiring manual mounts of <snapname>. + */ + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + + /* Clear the root flag (set via VFS_ROOT) as well. */ + (*vpp)->v_vflag &= ~VV_ROOT; + } + + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_snapdir_readdir(struct vop_readdir_args *ap) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + + ZFS_ENTER(zfsvfs); + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = uio->uio_offset - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(error)); + } + uio->uio_offset = cookie + dots_offset; + } + /* NOTREACHED */ +} + +static int +zfsctl_snapdir_getattr(struct vop_getattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); + uint64_t snap_count; + int err; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) { + ZFS_EXIT(zfsvfs); + return (err); + } + vap->va_nlink += snap_count; + } + vap->va_size = vap->va_nlink; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); + + +static int +zfsctl_snapshot_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + VERIFY(vrecycle(vp) == 1); + return (0); +} + +static int +zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); + return (0); +} + +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) +{ + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; + + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); + + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". + */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); + + /* + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. But we also need to hold the vnode, because + * the reference may go after vfs_unbusy() which has to be called + * before we can lock the vnode again. + */ + locked = VOP_ISLOCKED(vp); +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(vp); +#else + vhold(vp); +#endif + vput(vp); + + /* Look up .zfs/snapshot, our parent. */ + error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); + if (error == 0) { + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= len; + bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); + } + vfs_unbusy(mp); +#if __FreeBSD_version >= 1300045 + vget_finish(vp, locked | LK_RETRY, vs); +#else + vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); +#endif + return (error); +} + +/* + * These VP's should never see the light of day. They should always + * be covered. + */ +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = NULL, /* ensure very restricted access */ + .vop_inactive = zfsctl_snapshot_inactive, +#if __FreeBSD_version >= 1300045 + .vop_need_inactive = vop_stdneed_inactive, +#endif + .vop_reclaim = zfsctl_snapshot_reclaim, + .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_lock1 = vop_stdlock, + .vop_unlock = vop_stdunlock, + .vop_islocked = vop_stdislocked, + .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ + .vop_print = zfsctl_common_print, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs __unused = vfsp->vfs_data; + vnode_t *vp; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + *zfsvfsp = NULL; + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, objsetid, &vp); + if (error == 0 && vp != NULL) { + /* + * XXX Probably need to at least reference, if not busy, the mp. + */ + if (vp->v_mountedhere != NULL) + *zfsvfsp = vp->v_mountedhere->mnt_data; + vput(vp); + } + if (*zfsvfsp == NULL) + return (SET_ERROR(EINVAL)); + return (0); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct mount *mp; + vnode_t *vp; + uint64_t cookie; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + + cookie = 0; + for (;;) { + uint64_t id; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } + + for (;;) { + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, id, &vp); + if (error != 0 || vp == NULL) + break; + + mp = vp->v_mountedhere; + + /* + * v_mountedhere being NULL means that the + * (uncovered) vnode is in a transient state + * (mounting or unmounting), so loop until it + * settles down. + */ + if (mp != NULL) + break; + vput(vp); + } + if (error != 0) + break; + if (vp == NULL) + continue; /* no mountpoint, nothing to do */ + + /* + * The mount-point vnode is kept locked to avoid spurious EBUSY + * from a concurrent umount. + * The vnode lock must have recursive locking enabled. + */ + vfs_ref(mp); + error = dounmount(mp, fflags, curthread); + KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, + ("extra references after unmount")); + vput(vp); + if (error != 0) + break; + } + KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, + ("force unmounting failed")); + return (error); +} + +int +zfsctl_snapshot_unmount(char *snapname, int flags __unused) +{ + vfs_t *vfsp = NULL; + zfsvfs_t *zfsvfs = NULL; + + if (strchr(snapname, '@') == NULL) + return (0); + + int err = getzfsvfs(snapname, &zfsvfs); + if (err != 0) { + ASSERT3P(zfsvfs, ==, NULL); + return (0); + } + vfsp = zfsvfs->z_vfs; + + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + vfs_ref(vfsp); + vfs_unbusy(vfsp); + return (dounmount(vfsp, MS_FORCE, curthread)); +} diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c new file mode 100644 index 000000000..2f5962b25 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/kstat.h> + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size = 0; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +kstat_t *zfs_dbgmsg_kstat; + +/* + * Internal ZFS debug messages are enabled by default. + * + * # Print debug messages + * cat /proc/spl/kstat/zfs/dbgmsg + * + * # Disable the kernel debug message log. + * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable + * + * # Clear the kernel debug message log. + * echo 0 >/proc/spl/kstat/zfs/dbgmsg + */ +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_headers(char *buf, size_t size) +{ + (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message"); + + return (0); +} + +static int +zfs_dbgmsg_data(char *buf, size_t size, void *data) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; + + (void) snprintf(buf, size, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + + return (0); +} + +static void * +zfs_dbgmsg_addr(kstat_t *ksp, loff_t n) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + if (n == 0) + ksp->ks_private = list_head(&zfs_dbgmsgs); + else if (zdm) + ksp->ks_private = list_next(&zfs_dbgmsgs, zdm); + + return (ksp->ks_private); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + zfs_dbgmsg_t *zdm; + int size; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + while (zfs_dbgmsg_size > max_size) { + zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + zfs_dbgmsg_purge(0); + + return (0); +} + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); + + zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (zfs_dbgmsg_kstat) { + zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock; + zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX; + zfs_dbgmsg_kstat->ks_private = NULL; + zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update; + kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers, + zfs_dbgmsg_data, zfs_dbgmsg_addr); + kstat_install(zfs_dbgmsg_kstat); + } +} + +void +zfs_dbgmsg_fini(void) +{ + if (zfs_dbgmsg_kstat) + kstat_delete(zfs_dbgmsg_kstat); + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL + mutex_enter(&zfs_dbgmsgs_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs_lock); + mutex_destroy(&zfs_dbgmsgs_lock); +#endif +} + +void +__zfs_dbgmsg(char *buf) +{ + zfs_dbgmsg_t *zdm; + int size; + + DTRACE_PROBE1(zfs__dbgmsg, char *, buf); + + size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +#ifdef _KERNEL +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + size_t size; + char *buf; + char *nl; + int i; + + size = 1024; + buf = kmem_alloc(size, KM_SLEEP); + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + /* + * Get rid of trailing newline. + */ + nl = strrchr(buf, '\n'); + if (nl != NULL) + *nl = '\0'; + + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +#else + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} +#endif /* _KERNEL */ + +#ifdef _KERNEL +module_param(zfs_dbgmsg_enable, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); + +module_param(zfs_dbgmsg_maxsize, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); +#endif diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c new file mode 100644 index 000000000..e93b3e2cf --- /dev/null +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -0,0 +1,961 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/extdirent.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/sunddi.h> +#include <sys/random.h> +#include <sys/policy.h> +#include <sys/condvar.h> +#include <sys/callb.h> +#include <sys/smp.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/fs/zfs.h> +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/atomic.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/sa.h> +#include <sys/zfs_sa.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> + +/* + * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + matchtype_t mt, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, NULL, 0, NULL); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZXATTR: we want dzp's xattr directory + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + */ +int +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + + *zpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect how we perform zap lookups. + * + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if (zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. + */ + + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? ENOENT : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + return (error); + } + } else { + if (flag & ZNEW) { + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, &zp); + if (error) + return (error); + ASSERT(!zp->z_unlinked); + *zpp = zp; + } + + return (0); +} + +static int +zfs_dd_lookup(znode_t *dzp, znode_t **zpp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + uint64_t parent; + int error; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); + + if (dzp->z_unlinked) + return (ENOENT); + + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + return (error); +} + +int +zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) +{ + zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs; + znode_t *zp = NULL; + int error = 0; + +#ifdef ZFS_DEBUG + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); +#endif + if (dzp->z_unlinked) + return (SET_ERROR(ENOENT)); + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + error = zfs_dd_lookup(dzp, &zp); + if (error == 0) + *zpp = zp; + } else { + error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); + if (error == 0) { + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + *zpp = zp; + } + } + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_unlinked); + ASSERT(zp->z_links == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + dmu_tx_t *tx; + int error; + + /* + * Interate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); + + /* + * Due to changes in zfs_rmnode we need to make sure the + * link count is set to zero here. + */ + if (zp->z_links != 0) { + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + vput(ZTOV(zp)); + continue; + } + zp->z_links = 0; + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &zp->z_links, sizeof (zp->z_links), tx)); + dmu_tx_commit(tx); + } + + zp->z_unlinked = B_TRUE; + vput(ZTOV(zp)); + } + zap_cursor_fini(&zc); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + vput(ZTOV(xzp)); + skipped += 1; + continue; + } + + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + vput(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +extern taskq_t *zfsvfs_taskq; + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + uint64_t count; + int error; + + ASSERT(zp->z_links == 0); + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + + /* + * If this is an attribute directory, purge its contents. + */ + if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && + (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } else { + /* + * Free up all the data in the file. We don't do this for + * XATTR directories because we need truncate and remove to be + * in the same tx, like in zfs_znode_delete(). Otherwise, if + * we crash here we'll end up with an inconsistent truncated + * zap object in the delete queue. Note a truncated file is + * harmless since it only contains user data. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error) + xattr_obj = 0; + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xattr_obj) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* + * FreeBSD's implemention of zfs_zget requires a vnode to back it. + * This means that we could end up calling into getnewvnode while + * calling zfs_rmnode as a result of a prior call to getnewvnode + * trying to clear vnodes out of the cache. If this repeats we can + * recurse enough that we overflow our stack. To avoid this, we + * avoid calling zfs_zget on the xattr znode and instead simply add + * it to the unlinked set and schedule a call to zfs_unlinked_drain. + */ + if (xattr_obj) { + /* Add extended attribute directory to the unlinked set. */ + VERIFY3U(0, ==, + zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); + } + + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); + + if (xattr_obj) { + /* + * We're using the FreeBSD taskqueue API here instead of + * the Solaris taskq API since the FreeBSD API allows for a + * task to be enqueued multiple times but executed once. + */ + taskqueue_enqueue(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + } +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dzp. Can only fail if zp has been unlinked. + */ +int +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + } + if (zp_is_dir) { + if (dzp->z_links >= ZFS_LINK_MAX) + return (SET_ERROR(EMLINK)); + } + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + return (SET_ERROR(ENOENT)); + } + if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { + return (SET_ERROR(EMLINK)); + } + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + + } else { + ASSERT(zp->z_unlinked == 0); + } + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, + 8, 1, &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. + */ + if (error != 0) { + if (!(flag & ZRENAMING) && !(flag & ZNEW)) + zp->z_links--; + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, + name, mt, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); + } + + return (error); +} + +/* + * Unlink zp from dzp, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + } + if (!(flag & ZRENAMING)) { + + if (zp_is_dir && !zfs_dirempty(zp)) + return (SET_ERROR(ENOTEMPTY)); + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) { + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on vnode %p is %u, " + "should be at least %u", zp->z_vnode, + (int)zp->z_links, + zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT0(error); + } else { + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) + return (error); + } + + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. + */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_size == 2); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t parent __unused; + + *xvpp = NULL; + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + getnewvnode_reserve_(); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + *xvpp = xzp; + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + vattr_t va; + int error; +top: + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); + if (error) + return (error); + + if (xzp != NULL) { + *xzpp = xzp; + return (0); + } + + + if (!(flags & CREATE_XATTR_DIR)) + return (SET_ERROR(ENOATTR)); + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xzpp, cr); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + if (error == 0) + VOP_UNLOCK1(ZTOV(*xzpp)); + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c new file mode 100644 index 000000000..ec7c04717 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_recv.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_file.h> +#include <sys/buf.h> +#include <sys/stat.h> + +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + struct thread *td; + int rc, fd; + + td = curthread; + pwd_ensure_dirs(); + /* 12.x doesn't take a const char * */ + rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), + UIO_SYSSPACE, flags, mode); + if (rc) + return (SET_ERROR(rc)); + fd = td->td_retval[0]; + td->td_retval[0] = 0; + if (fget(curthread, fd, &cap_no_rights, fpp)) + kern_close(td, fd); + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + fo_close(fp, curthread); +} + +static int +zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp, + ssize_t *resid) +{ + ssize_t rc; + struct uio auio; + struct thread *td; + struct iovec aiov; + + td = curthread; + aiov.iov_base = (void *)(uintptr_t)buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = count; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + auio.uio_offset = *offp; + + if ((fp->f_flag & FWRITE) == 0) + return (SET_ERROR(EBADF)); + + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + + rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td); + if (rc) + return (SET_ERROR(rc)); + if (resid) + *resid = auio.uio_resid; + else if (auio.uio_resid) + return (SET_ERROR(EIO)); + *offp += count - auio.uio_resid; + return (rc); +} + +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + ssize_t rc; + + rc = zfs_file_write_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + + return (SET_ERROR(rc)); +} + +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_write_impl(fp, buf, count, &off, resid)); +} + +static int +zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, + ssize_t *resid) +{ + ssize_t rc; + struct uio auio; + struct thread *td; + struct iovec aiov; + + td = curthread; + aiov.iov_base = (void *)(uintptr_t)buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = count; + auio.uio_rw = UIO_READ; + auio.uio_td = td; + auio.uio_offset = *offp; + + if ((fp->f_flag & FREAD) == 0) + return (SET_ERROR(EBADF)); + + rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); + if (rc) + return (SET_ERROR(rc)); + *resid = auio.uio_resid; + *offp += count - auio.uio_resid; + return (SET_ERROR(0)); +} + +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + ssize_t rc; + + rc = zfs_file_read_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + return (rc); +} + +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_read_impl(fp, buf, count, &off, resid)); +} + +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + int rc; + struct thread *td; + + td = curthread; + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) + return (SET_ERROR(ESPIPE)); + rc = fo_seek(fp, *offp, whence, td); + if (rc == 0) + *offp = td->td_uretoff.tdu_off; + return (SET_ERROR(rc)); +} + +int +zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct thread *td; + struct stat sb; + int rc; + + td = curthread; + + rc = fo_stat(fp, &sb, td->td_ucred, td); + if (rc) + return (SET_ERROR(rc)); + zfattr->zfa_size = sb.st_size; + zfattr->zfa_mode = sb.st_mode; + + return (0); +} + +static __inline int +zfs_vop_fsync(vnode_t *vp) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto drop; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(vp, MNT_WAIT, curthread); + VOP_UNLOCK1(vp); + vn_finished_write(mp); +drop: + return (SET_ERROR(error)); +} + +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + struct vnode *v; + + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + + v = fp->f_data; + return (zfs_vop_fsync(v)); +} + +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + struct file *fp; + + if (fget(curthread, fd, &cap_no_rights, &fp)) + return (SET_ERROR(EBADF)); + + *fpp = fp; + return (0); +} + +void +zfs_file_put(int fd) +{ + struct file *fp; + + /* No CAP_ rights required, as we're only releasing. */ + if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { + fdrop(fp, curthread); + fdrop(fp, curthread); + } +} + +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_offset); +} + +void * +zfs_file_private(zfs_file_t *fp) +{ + file_t *tmpfp; + void *data; + int error; + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + curthread->td_fpop = tmpfp; + if (error != 0) + return (NULL); + return (data); +} + +int +zfs_file_unlink(const char *fnamep) +{ + enum uio_seg seg = UIO_SYSSPACE; + int rc; + +#if __FreeBSD_version >= 1300018 + rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); +#else +#ifdef AT_BENEATH + rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0); +#else + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0); +#endif +#endif + return (SET_ERROR(rc)); +} diff --git a/module/os/freebsd/zfs/zfs_fuid_os.c b/module/os/freebsd/zfs/zfs_fuid_os.c new file mode 100644 index 000000000..ebd09abd6 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_fuid_os.c @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/avl.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/nvpair.h> +#ifdef _KERNEL +#include <sys/sid.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#endif +#include <sys/zfs_fuid.h> + +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); +} diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c new file mode 100644 index 000000000..4b7e86467 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_os.h> +#include <sys/vdev_impl.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_redact.h> +#include <sys/dmu_tx.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/pathname.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> +#include <sys/dsl_scan.h> +#include <sys/fm/util.h> +#include <sys/dsl_crypt.h> + +#include <sys/dmu_recv.h> +#include <sys/dmu_send.h> +#include <sys/dmu_recv.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_bookmark.h> +#include <sys/dsl_userhold.h> +#include <sys/zfeature.h> +#include <sys/zcp.h> +#include <sys/zio_checksum.h> +#include <sys/vdev_removal.h> +#include <sys/vdev_trim.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> +#include <sys/zfs_ioctl_impl.h> + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, 0); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +int +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static const zfs_ioc_key_t zfs_keys_nextboot[] = { + {"command", DATA_TYPE_STRING, 0}, + { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, + { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} +}; + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) +{ + char name[MAXNAMELEN]; + spa_t *spa; + vdev_t *vd; + char *command; + uint64_t pool_guid; + uint64_t vdev_guid; + int error; + + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + return (EINVAL); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (EINVAL); + if (nvlist_lookup_string(innvl, + "command", &command) != 0) + return (EINVAL); + + mutex_enter(&spa_namespace_lock); + spa = spa_by_guid(pool_guid, vdev_guid); + if (spa != NULL) + strcpy(name, spa_name(spa)); + mutex_exit(&spa_namespace_lock); + if (spa == NULL) + return (ENOENT); + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENXIO); + spa_close(spa, FTAG); + return (ENODEV); + } + error = vdev_label_write_pad2(vd, command, strlen(command)); + (void) spa_vdev_state_exit(spa, NULL, 0); + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_close(spa, FTAG); + return (error); +} + + +void +zfs_ioctl_init_os(void) +{ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3); + +} diff --git a/module/os/freebsd/zfs/zfs_onexit_os.c b/module/os/freebsd/zfs/zfs_onexit_os.c new file mode 100644 index 000000000..8b22f2fdc --- /dev/null +++ b/module/os/freebsd/zfs/zfs_onexit_os.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_onexit.h> + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_state(minor, ZST_ONEXIT); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp, *tmpfp; + zfs_onexit_t *zo; + void *data; + int error; + + if ((error = zfs_file_get(fd, &fp))) + return (error); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (SET_ERROR(EBADF)); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + zfs_file_put(fd); +} diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c new file mode 100644 index 000000000..d6f7fc11e --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -0,0 +1,2448 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek <[email protected]>. + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/acl.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/cmn_err.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/sunddi.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/spa_boot.h> +#include <sys/jail.h> +#include <ufs/ufs/quota.h> +#include <sys/zfs_quota.h> + +#include "zfs_comutil.h" + +#ifndef MNTK_VMSETSIZE_BUG +#define MNTK_VMSETSIZE_BUG 0 +#endif +#ifndef MNTK_NOMSYNC +#define MNTK_NOMSYNC 8 +#endif + +/* BEGIN CSTYLED */ +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); +/* END CSTYLED */ + +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors); +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); +static void zfs_freevfs(vfs_t *vfsp); + +struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, +#if __FreeBSD_version >= 1300049 + .vfs_root = vfs_cache_root, + .vfs_cachedroot = zfs_root, +#else + .vfs_root = zfs_root, +#endif + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_checkexp = zfs_checkexp, + .vfs_fhtovp = zfs_fhtovp, + .vfs_quotactl = zfs_quotactl, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +int +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ + int error; + zfsvfs_t *zfvp; + vfs_t *vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + error = getzfsvfs_impl(os, &zfvp); + if (error != 0) + return (error); + if (zfvp == NULL) + return (ENOENT); + vfsp = zfvp->z_vfs; + switch (zfs_prop) { + case ZFS_PROP_ATIME: + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) + tmp = 1; + break; + case ZFS_PROP_DEVICES: + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + tmp = 1; + break; + case ZFS_PROP_EXEC: + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + tmp = 1; + break; + case ZFS_PROP_SETUID: + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + tmp = 1; + break; + case ZFS_PROP_READONLY: + if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + tmp = 1; + break; + case ZFS_PROP_XATTR: + if (zfvp->z_flags & ZSB_XATTR) + tmp = zfvp->z_xattr; + break; + case ZFS_PROP_NBMAND: + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) + tmp = 1; + break; + default: + vfs_unbusy(vfsp); + return (ENOENT); + } + + vfs_unbusy(vfsp); + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; + } + return (0); +} + +static int +zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) +{ + int error = 0; + char buf[32]; + uint64_t usedobj, quotaobj; + uint64_t quota, used = 0; + timespec_t now; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) { + error = ENOENT; + goto done; + } + (void) sprintf(buf, "%llx", (longlong_t)id); + if ((error = zap_lookup(zfsvfs->z_os, quotaobj, + buf, sizeof (quota), 1, "a)) != 0) { + dprintf("%s(%d): quotaobj lookup failed\n", + __FUNCTION__, __LINE__); + goto done; + } + /* + * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". + * So we set them to be the same. + */ + dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); + error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); + if (error && error != ENOENT) { + dprintf("%s(%d): usedobj failed; %d\n", + __FUNCTION__, __LINE__, error); + goto done; + } + dqp->dqb_curblocks = btodb(used); + dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; + vfs_timestamp(&now); + /* + * Setting this to 0 causes FreeBSD quota(8) to print + * the number of days since the epoch, which isn't + * particularly useful. + */ + dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; +done: + return (error); +} + +static int +zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct thread *td; + int cmd, type, error = 0; + int bitsize; + zfs_userquota_prop_t quota_type; + struct dqblk64 dqblk = { 0 }; + + td = curthread; + cmd = cmds >> SUBCMDSHIFT; + type = cmds & SUBCMDMASK; + + ZFS_ENTER(zfsvfs); + if (id == -1) { + switch (type) { + case USRQUOTA: + id = td->td_ucred->cr_ruid; + break; + case GRPQUOTA: + id = td->td_ucred->cr_rgid; + break; + default: + error = EINVAL; + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); + goto done; + } + } + /* + * Map BSD type to: + * ZFS_PROP_USERUSED, + * ZFS_PROP_USERQUOTA, + * ZFS_PROP_GROUPUSED, + * ZFS_PROP_GROUPQUOTA + */ + switch (cmd) { + case Q_SETQUOTA: + case Q_SETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERQUOTA; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPQUOTA; + else + error = EINVAL; + break; + case Q_GETQUOTA: + case Q_GETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERUSED; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPUSED; + else + error = EINVAL; + break; + } + + /* + * Depending on the cmd, we may need to get + * the ruid and domain (see fuidstr_to_sid?), + * the fuid (how?), or other information. + * Create fuid using zfs_fuid_create(zfsvfs, id, + * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? + * I think I can use just the id? + * + * Look at zfs_id_overquota() to look up a quota. + * zap_lookup(something, quotaobj, fuidstring, + * sizeof (long long), 1, "a) + * + * See zfs_set_userquota() to set a quota. + */ + if ((uint32_t)type >= MAXQUOTAS) { + error = EINVAL; + goto done; + } + + switch (cmd) { + case Q_GETQUOTASIZE: + bitsize = 64; + error = copyout(&bitsize, arg, sizeof (int)); + break; + case Q_QUOTAON: + // As far as I can tell, you can't turn quotas on or off on zfs + error = 0; + vfs_unbusy(vfsp); + break; + case Q_QUOTAOFF: + error = ENOTSUP; + vfs_unbusy(vfsp); + break; + case Q_SETQUOTA: + error = copyin(&dqblk, arg, sizeof (dqblk)); + if (error == 0) + error = zfs_set_userquota(zfsvfs, quota_type, + "", id, dbtob(dqblk.dqb_bhardlimit)); + break; + case Q_GETQUOTA: + error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); + if (error == 0) + error = copyout(&dqblk, arg, sizeof (dqblk)); + break; + default: + error = EINVAL; + break; + } +done: + ZFS_EXIT(zfsvfs); + return (error); +} + + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); +} + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncher. ZFS already commits async data + * at zfs_txg_timeout intervals. + */ + if (waitfor == MNT_LAZY) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (rebooting && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_flags &= ~ZSB_XATTR; + } else { + zfsvfs->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. + * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + boolean_t do_xattr = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * We need to enter pool configuration here, so that we can use + * dsl_prop_get_int_ds() to handle the special nbmand property below. + * dsl_prop_get_integer() can not be used, because it has to acquire + * spa_namespace_lock and we can not do that because we already hold + * z_teardown_lock. The problem is that spa_write_cachefile() is called + * with spa_namespace_lock held and the function calls ZFS vnode + * operations to write the cache file and thus z_teardown_lock is + * acquired after spa_namespace_lock. + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +taskq_t *zfsvfs_taskq; + +static void +zfsvfs_task_unlinked_drain(void *context, int pending __unused) +{ + + zfs_unlinked_drain((zfsvfs_t *)context); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. + */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, + &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + + return (error); +} + + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, + zfsvfs_task_unlinked_drain, zfsvfs); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) + return (SET_ERROR(EROFS)); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) { + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + } else { + dsl_dir_t *dd; + + zfs_unlinked_drain(zfsvfs); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + boolean_t use_nc = zfsvfs->z_use_namecache; + zfsvfs->z_use_namecache = B_FALSE; + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + zfsvfs->z_use_namecache = use_nc; + } + } + + /* restore readonly bit */ + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + ASSERT(zfsvfs->z_nr_znodes == 0); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + uint64_t recordsize, fsid_guid; + int error = 0; + zfsvfs_t *zfsvfs; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + if ((error = dsl_prop_get_integer(osname, + "recordsize", &recordsize, NULL))) + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + /* + * This can cause a loss of coherence between ARC and page cache + * on ZoF - unclear if the problem is in FreeBSD or ZoF + */ + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ + vfsp->mnt_kern_flag |= MNTK_NOMSYNC; + vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + (vfsp->mnt_vfc->vfc_typenum & 0xFF); + + /* + * Set features for file system. + */ + zfs_set_fuid_feature(zfsvfs); + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, + "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (SET_ERROR(EINVAL)); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (SET_ERROR(EINVAL)); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 0 : EACCES); + } + return (SET_ERROR(EACCES)); +} + +/* + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns 0 if access allowed, error otherwise (e.g. EACCES) + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (SET_ERROR(EACCES)); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (SET_ERROR(EACCES)); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (SET_ERROR(EACCES)); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp) +{ + kthread_t *td = curthread; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK1(mvp); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK1(mvp); + goto out; + } + VOP_UNLOCK1(mvp); + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = SET_ERROR(EPERM); + goto out; + } + +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname); + if (error) + goto out; + } + DROP_GIANT(); + error = zfs_domount(vfsp, osname); + PICKUP_GIANT(); + +out: + return (error); +} + +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + statp->f_version = STATFS_VERSION; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_bsize = SPA_MINBLOCKSIZE; + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes / statp->f_bsize; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_files = statp->f_ffree + usedobjs; + + /* + * We're a zfs filesystem. + */ + strlcpy(statp->f_fstypename, "zfs", + sizeof (statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof (statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof (statp->f_mntonname)); + + statp->f_namemax = MAXNAMELEN - 1; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + + if (error == 0) { + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + dsl_dir_t *dd; + + /* + * If someone has not already unmounted this file system, + * drain the zrele_taskq to ensure all active references to the + * zfsvfs_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but zreles run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. + */ + int round = 0; + while (zfsvfs->z_nr_znodes > 0) { + taskq_wait_outstanding(dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + } + } + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ +#ifdef FREEBSD_NAMECACHE + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); +#endif + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relavent for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_sa_hdl) { + ASSERT(ZTOV(zp)->v_count >= 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (!zfs_is_readonly(zfsvfs)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + cred_t *cr = td->td_ucred; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + if (dsl_deleg_access((char *)vfsp->vfs_resource, + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + } + + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); + if (ret != 0) + return (ret); + while (taskqueue_cancel(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task, NULL) != 0) + taskqueue_drain(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + zfs_freevfs(vfsp); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; + + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. + */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); + + ZFS_ENTER(zfsvfs); + err = zfs_zget(zfsvfs, ino, &zp); + if (err == 0 && zp->z_unlinked) { + vrele(ZTOV(zp)); + err = EINVAL; + } + if (err == 0) + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + if (err == 0) { + err = vn_lock(*vpp, flags); + if (err != 0) + vrele(*vpp); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. + */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); + +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) +{ + struct componentname cn; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + vnode_t *dvp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + /* + * On FreeBSD we can get snapshot's mount point or its parent file + * system mount point depending if snapshot is already mounted or not. + */ + if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (SET_ERROR(EINVAL)); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * A zero fid_gen means we are in .zfs or the .zfs/snapshot + * directory tree. If the object == zfsvfs->z_shares_dir, then + * we are in the .zfs/shares directory tree. + */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); + if (object == ZFSCTL_INO_SNAPDIR) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. + */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else { + *vpp = dvp; + } + return (err); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + vrele(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + ds->ds_dir->dd_activity_cancelled = B_FALSE; + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + +bail: + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#endif + +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof (struct vm_object) + + sizeof (struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + + zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); +} + +void +zfs_fini(void) +{ + taskq_destroy(zfsvfs_taskq); + zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_vfs, 0); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %lu to %lu", zfsvfs->z_version, newvers); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the coresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void) strlcpy(fromname, newname, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", + newname, fromname + oldlen); + (void) strlcpy(fromname, tmpbuf, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +} +#endif diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c new file mode 100644 index 000000000..d7b92035f --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vnops.c @@ -0,0 +1,6533 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vm.h> +#include <sys/vnode.h> +#include <sys/dirent.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/atomic.h> +#include <sys/namei.h> +#include <sys/mman.h> +#include <sys/cmn_err.h> +#include <sys/kdb.h> +#include <sys/sysproto.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/filio.h> +#include <sys/sid.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_quota.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_rlock.h> +#include <sys/extdirent.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sched.h> +#include <sys/acl.h> +#include <sys/vmmeter.h> +#include <vm/vm_param.h> +#include <sys/zil.h> +#include <sys/zfs_vnops.h> + +#include <vm/vm_object.h> + +#include <sys/extattr.h> +#include <sys/priv.h> + +#ifndef VN_OPEN_INVFS +#define VN_OPEN_INVFS 0x0 +#endif + +#if __FreeBSD_version >= 1300047 +#define vm_page_wire_lock(pp) +#define vm_page_wire_unlock(pp) +#else +#define vm_page_wire_lock(pp) vm_page_lock(pp) +#define vm_page_wire_unlock(pp) vm_page_unlock(pp) +#endif + +static int +zfs_u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum) +{ + + return (u8_validate(__DECONST(char *, u8str), n, list, flag, errnum)); +} +#define u8_validate zfs_u8_validate + +#ifdef DEBUG_VFS_LOCKS +#define VNCHECKREF(vp) \ + VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ + ("%s: wrong ref counts", __func__)); +#else +#define VNCHECKREF(vp) +#endif + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, + int *rvalp) +{ + offset_t off; + int error; + zfsvfs_t *zfsvfs; + znode_t *zp; + + switch (com) { + case _FIOFFS: + { + return (0); + + /* + * The following two ioctls are used by bfu. Faking out, + * necessary to avoid bfu errors. + */ + } + case _FIOGDIO: + case _FIOSDIO: + { + return (0); + } + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + { + off = *(offset_t *)data; + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + *(offset_t *)data = off; + return (0); + } + } + return (SET_ERROR(ENOTTY)); +} + +static vm_page_t +page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) +{ + vm_object_t obj; + vm_page_t pp; + int64_t end; + + /* + * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE + * aligned boundaries, if the range is not aligned. As a result a + * DEV_BSIZE subrange with partially dirty data may get marked as clean. + * It may happen that all DEV_BSIZE subranges are marked clean and thus + * the whole page would be considred clean despite have some dirty data. + * For this reason we should shrink the range to DEV_BSIZE aligned + * boundaries before calling vm_page_clear_dirty. + */ + end = rounddown2(off + nbytes, DEV_BSIZE); + off = roundup2(off, DEV_BSIZE); + nbytes = end - off; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); +#if __FreeBSD_version < 1300050 + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + vm_page_sbusy(pp); + } else if (pp != NULL) { + ASSERT(!pp->valid); + pp = NULL; + } + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } + break; + } +#else + vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | + VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } +#endif + return (pp); +} + +static void +page_unbusy(vm_page_t pp) +{ + + vm_page_sunbusy(pp); +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(pp->object); +#else + vm_object_pip_subtract(pp->object, 1); +#endif +} + +#if __FreeBSD_version > 1300051 +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t m; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | + VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); + return (m); +} +#else +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_wire_lock(pp); + vm_page_hold(pp); + vm_page_wire_unlock(pp); + + } else + pp = NULL; + break; + } + return (pp); +} +#endif + +static void +page_unhold(vm_page_t pp) +{ + + vm_page_wire_lock(pp); +#if __FreeBSD_version >= 1300035 + vm_page_unwire(pp, PQ_ACTIVE); +#else + vm_page_unhold(pp); +#endif + vm_page_wire_unlock(pp); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) +{ + vm_object_t obj; + struct sf_buf *sf; + caddr_t va; + int off; + + ASSERT(segflg != UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + off = start & PAGEOFFSET; + zfs_vmobject_wlock(obj); +#if __FreeBSD_version >= 1300041 + vm_object_pip_add(obj, 1); +#endif + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + int nbytes = imin(PAGESIZE - off, len); + + if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { + zfs_vmobject_wunlock(obj); + + va = zfs_map_page(pp, &sf); + (void) dmu_read(os, oid, start+off, nbytes, + va+off, DMU_READ_PREFETCH); + zfs_unmap_page(sf); + + zfs_vmobject_wlock(obj); + page_unbusy(pp); + } + len -= nbytes; + off = 0; + } +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(obj); +#else + vm_object_pip_wakeupn(obj, 0); +#endif + zfs_vmobject_wunlock(obj); +} + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. + * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain exclusive busy on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. + */ +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int error = 0; + + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + zfs_vmobject_wlock(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (vm_page_none_valid(pp)) { + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + vm_page_do_sunbusy(pp); +#if __FreeBSD_version >= 1300047 && __FreeBSD_version < 1300051 +#error "unsupported version window" +#elif __FreeBSD_version >= 1300051 + if (error == 0) { + vm_page_valid(pp); + vm_page_lock(pp); + vm_page_activate(pp); + vm_page_unlock(pp); + } + vm_page_do_sunbusy(pp); + if (error != 0 && !vm_page_wired(pp) == 0 && + pp->valid == 0 && vm_page_tryxbusy(pp)) + vm_page_free(pp); +#else + vm_page_lock(pp); + if (error) { + if (pp->wire_count == 0 && pp->valid == 0 && + !vm_page_busied(pp)) + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); +#endif + } else { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + zfs_vmobject_wunlock(obj); + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + vm_object_t obj; + int64_t start; + int len = nbytes; + int off; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + zfs_vmobject_wlock(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if ((pp = page_hold(vp, start))) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = vn_io_fault_uiomove(va + off, bytes, uio); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock(obj); + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ssize_t n, nbytes; + int error = 0; + zfs_locked_range_t *lr; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (zfsvfs->z_log && + (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, + uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else if (vn_has_cached_data(vp)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } +out: + zfs_rangelock_exit(lr); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is + * set if in append mode. + * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = MAXOFFSET_T; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + zfs_locked_range_t *lr; + int max_blksz = zfsvfs->z_max_blksz; + int error = 0; + arc_buf_t *abuf; + iovec_t *aiov = NULL; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt __unused = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; + uint64_t uid, gid, projid; + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + zilog = zfsvfs->z_log; + + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If in append mode, set the io offset pointer to eof. + */ + if (ioflag & FAPPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (EFBIG); + } + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); + + uid = zp->z_uid; + gid = zp->z_gid; + projid = zp->z_projid; + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio->uio_loffset; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + abuf = NULL; + if (xuio) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (n >= max_blksz && + woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + if (woff + nbytes > zp->z_size) + vnode_pager_setsize(vp, woff + nbytes); + + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + tx_bytes -= uio->uio_resid; + } else { + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, + abuf, tx); + } + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && vn_has_cached_data(vp)) { + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id, uio->uio_segflg, tx); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the excute bits is set. + * + * It would be nice to to this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(vp, cr, + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio->uio_loffset); + ASSERT(error == 0 || error == EFAULT); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, + ioflag, NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + + } + + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * EFAULT means that at least one page of the source buffer was not + * available. VFS will re-try remaining I/O upon this error. + */ + if (error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + ZFS_EXIT(zfsvfs); + return (0); +} + +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *presid) +{ + int error = 0; + ssize_t resid; + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, + UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); + + if (error) { + return (SET_ERROR(error)); + } else if (presid == NULL) { + if (resid != 0) { + error = SET_ERROR(EIO); + } + } else { + *presid = resid; + } + return (error); +} + +void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq(dmu_objset_pool(os))); + return (SET_ERROR(ENOENT)); + } + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, + size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP, + * so there's no need to zio_flush() its + * vdevs (flushing would needlesly hurt + * performance, and doesn't work on + * indirect vdevs). + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +static int +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + int error; + + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); + return (error); +} + +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; + int error; + int ltype; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(dvp, __func__); +#ifdef DIAGNOSTIC + if ((zdp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); +#endif + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. + * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * ct - caller context + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, + int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) +{ + znode_t *zdp = VTOZ(dvp); + znode_t *zp; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error = 0; + + /* + * Fast path lookup, however we must skip DNLC lookup + * for case folding or normalizing lookups because the + * DNLC code only stores the passed in name. This means + * creating 'a' and removing 'A' on a case insensitive + * file system would work, but DNLC still thinks 'a' + * exists and won't let you create it again on the next + * pass through fast path. + */ + if (!(flags & LOOKUP_XATTR)) { + if (dvp->v_type != VDIR) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { + ZFS_EXIT(zfsvfs); + return (error); + } + *vpp = ZTOV(zp); + + /* + * Do we have permission to get into attribute directory? + */ + error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); + if (error) { + vrele(ZTOV(zp)); + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Check accessibility of directory if we're not coming in via + * VOP_CACHEDLOOKUP. + */ + if (!cached) { +#ifdef NOEXECCHECK + if ((cnp->cn_flags & NOEXECCHECK) != 0) { + cnp->cn_flags &= ~NOEXECCHECK; + } else +#endif + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + + /* + * First handle the special cases. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0) { + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. + */ + if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { + struct componentname cn; + vnode_t *zfsctl_vp; + int ltype; + + ZFS_EXIT(zfsvfs); + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK1(dvp); + error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, + &zfsctl_vp); + if (error == 0) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = cnp->cn_nameiop; + cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; + cn.cn_lkflags = cnp->cn_lkflags; + error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); + vput(zfsctl_vp); + } + vn_lock(dvp, ltype | LK_RETRY); + return (error); + } + } + if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { + ZFS_EXIT(zfsvfs); + if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); + return (error); + } + + /* + * The loop is retry the lookup if the parent-child relationship + * changes during the dot-dot locking complexities. + */ + for (;;) { + uint64_t parent; + + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) + *vpp = ZTOV(zp); + + ZFS_EXIT(zfsvfs); + if (error != 0) + break; + + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + if (error != 0) { + /* + * If we've got a locking error, then the vnode + * got reclaimed because of a force unmount. + * We never enter doomed vnodes into the name cache. + */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + + if (error != 0) + *vpp = NULL; + + /* Translate errors and add SAVENAME when needed. */ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + break; + } + /* FALLTHROUGH */ + case DELETE: + if (error == 0) + cnp->cn_flags |= SAVENAME; + break; + } + } + + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); + + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } + + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, + znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + uint64_t projid = ZFS_DEFAULT_PROJID; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype; +#ifdef DEBUG_VFS_LOCKS + vnode_t *dvp = ZTOV(dzp); +#endif + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + *zpp = NULL; + + if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~S_ISVTX; + + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + getnewvnode_reserve_(); + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + +out: + VNCHECKREF(dvp); + if (error == 0) { + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +/*ARGSUSED*/ +static int +zfs_remove_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t xattr_obj; + uint64_t obj = 0; + dmu_tx_t *tx; + boolean_t unlinked; + uint64_t txtype; + int error; + + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zp = VTOZ(vp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + xattr_obj = 0; + xzp = NULL; + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + obj = zp->z_id; + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; + } + /* XXX check changes to linux vnops */ + txtype = TX_REMOVE; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + + if (xzp) + vrele(ZTOV(xzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + + ZFS_EXIT(zfsvfs); + return (error); +} + + +int +zfs_lookup_internal(znode_t *dzp, char *name, vnode_t **vpp, + struct componentname *cnp, int nameiop) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + cnp->cn_nameptr = name; + cnp->cn_namelen = strlen(name); + cnp->cn_nameiop = nameiop; + cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cnp->cn_cred = kcred; + cnp->cn_thread = curthread; + + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { + struct vop_lookup_args a; + + a.a_gen.a_desc = &vop_lookup_desc; + a.a_dvp = ZTOV(dzp); + a.a_vpp = vpp; + a.a_cnp = cnp; + error = vfs_cache_lookup(&a); + } else { + error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, + curthread, 0, B_FALSE); + } +#ifdef ZFS_DEBUG + if (error) { + printf("got error %d on name %s on op %d\n", error, name, + nameiop); + kdb_backtrace(); + } +#endif + return (error); +} + +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + vnode_t *vp; + int error; + struct componentname cn; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_remove_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, + int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t txtype; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + ((vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ + *zpp = NULL; + + if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + getnewvnode_reserve_(); + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, + acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rmdir_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (vp->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + cache_purge(dvp); + + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + zfs_log_remove(zilog, tx, txtype, dzp, name, + ZFS_NO_OBJECT, B_FALSE); + } + + dmu_tx_commit(tx); + + cache_purge(vp); +out: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) +{ + struct componentname cn; + vnode_t *vp; + int error; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure). + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int *ncookies, ulong_t **cookies) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + edirent_t *eodp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error; + uint8_t prefetch; + boolean_t check_sysattrs; + uint8_t type; + int ncooks; + ulong_t *cooks = NULL; + int flags = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio->uio_loffset; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + outbuf = NULL; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + if (ncookies != NULL) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncooks = uio->uio_resid / (sizeof (struct dirent) - + sizeof (((struct dirent *)NULL)->d_name) + 1); + cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); + *cookies = cooks; + *ncookies = ncooks; + } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. + */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next = NULL; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } + } + + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + vrele(ZTOV(ezp)); + goto skip_entry; + } + vrele(ZTOV(ezp)); + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = SET_ERROR(EINVAL); + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + /* NOTE: d_off is the offset for the *next* entry. */ + next = &odp->d_off; + strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + dirent_terminate(odp); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + skip_entry: + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Fill the offset right after advancing the cursor. */ + if (next != NULL) + *next = offset; + if (cooks != NULL) { + *cooks++ = offset; + ncooks--; + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); + } + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* Subtract unused cookies */ + if (ncookies != NULL) + *ncookies -= ncooks; + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. + */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + if (error != 0 && cookies != NULL) { + free(*cookies, M_TEMP); + *cookies = NULL; + *ncookies = 0; + } + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds). + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint32_t blksize; + u_longlong_t nblocks; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vp->v_type == VBLK || vp->v_type == VCHR) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; + vn_fsid(vp, vap); + vap->va_nodeid = zp->z_id; + vap->va_nlink = zp->z_links; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && + zp->z_links < ZFS_LINK_MAX) + vap->va_nlink++; + vap->va_size = zp->z_size; + if (vp->v_type == VBLK || vp->v_type == VCHR) + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + vap->va_filerev = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + xoap->xoa_projinherit = + ((zp->z_pflags & ZFS_PROJINHERIT) != 0); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + xoap->xoa_projid = zp->z_projid; + XVA_SET_RTN(xvap, XAT_PROJID); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); + + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + xva_init(&tmpxvattr); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Note: ZFS_READONLY is handled in zfs_zaccess_common. + */ + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + } + if (xoap != NULL && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && + TIMESPEC_OVERFLOW(&vap->va_birthtime)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + } + + attrzp = NULL; + aclp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. + */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). + */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. + */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime); + } + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(vp->v_type == VREG); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } +out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (attrzp) + vput(ZTOV(attrzp)); + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. + */ +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) +{ + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK1(tdvp); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK1(*tvpp); + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK1(tdvp); + goto relock; + } + tdzp = VTOZ(tdvp); + + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp, if it disappeared we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK1(nvp); + /* + * Concurrent rename race. + * XXX ? + */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + VOP_UNLOCK1(*svpp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + vput(nvp); + goto relock; + } + *tvpp = nvp; + } + + return (0); + +out: + return (error); +} + +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; + + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; + } + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; + + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; + + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr, int log) +{ + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; + dmu_tx_t *tx; + char *snm = scnp->cn_nameptr; + char *tnm = tcnp->cn_nameptr; + int error = 0; + + /* Reject renames across filesystems. */ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Lock all four vnodes to ensure safety and semantics of renaming. + */ + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); + } + + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + + /* + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. + */ + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + error = SET_ERROR(EILSEQ); + goto unlockout; + } + + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; + } + + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; + } + + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; + } + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } + + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_check(szp, sdzp, tdzp))) + goto unlockout; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); + } + } else { + if ((*tvpp)->v_type == VDIR) { + error = SET_ERROR(EISDIR); + goto unlockout; + } + } + } + + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); + if (tzp) + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto unlockout; + } + + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); + + if (error == 0) { + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); + + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } + if (error == 0) { + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); + } + } + + dmu_tx_commit(tx); + +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(*svpp); + VOP_UNLOCK1(sdvp); + +out: /* original two vnodes are locked */ + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (*tvpp != NULL) + VOP_UNLOCK1(*tvpp); + if (tdvp != *tvpp) + VOP_UNLOCK1(tdvp); + return (error); +} + +int +zfs_rename(znode_t *sdzp, char *sname, znode_t *tdzp, char *tname, + cred_t *cr, int flags) +{ + struct componentname scn, tcn; + vnode_t *sdvp, *tdvp; + vnode_t *svp, *tvp; + int error; + svp = tvp = NULL; + + sdvp = ZTOV(sdzp); + tdvp = ZTOV(tdzp); + error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); + if (sdzp->z_zfsvfs->z_replay == B_FALSE) + VOP_UNLOCK1(sdvp); + if (error != 0) + goto fail; + VOP_UNLOCK1(svp); + + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); + if (error == EJUSTRETURN) + tvp = NULL; + else if (error != 0) { + VOP_UNLOCK1(tdvp); + goto fail; + } + + error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0); +fail: + if (svp != NULL) + vrele(svp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. + * vap - Attributes of new entry. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, + const char *link, znode_t **zpp, cred_t *cr, int flags) +{ + znode_t *zp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + + ASSERT(vap->va_type == VLNK); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, + 0 /* projid */)) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + getnewvnode_reserve_(); + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datsets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + __DECONST(void *, link), len, tx); + else + zfs_sa_symlink(zp, __DECONST(char *, link), len, tx); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. + */ + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + + zfs_log_symlink(zilog, tx, txtype, dzp, zp, + __DECONST(char *, name), __DECONST(char *, link)); + *zpp = zp; + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by vp. + * + * IN: vp - vnode of symbolic link. + * uio - structure to contain the link path. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - structure containing the link path. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, + int flags) +{ + znode_t *tzp; + zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + uint64_t parent; + uid_t owner; + + ASSERT(ZTOV(tdzp)->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(tdzp); + zilog = zfsvfs->z_log; + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (ZTOV(szp)->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (szp->z_pflags & (ZFS_APPENDONLY | + ZFS_IMMUTABLE | ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(tdzp, name, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } + + dmu_tx_commit(tx); + + if (error == 0) { + vnevent_link(ZTOV(szp), ct); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: ip - inode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + + +CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid)); +CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid)); + +/*ARGSUSED*/ +static int +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + fidp->fid_len = size; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + + switch (cmd) { + case _PC_LINK_MAX: + *valp = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + case _PC_MIN_HOLE_SIZE: + *valp = (int)SPA_MINBLOCKSIZE; + return (0); + case _PC_ACL_EXTENDED: + *valp = 0; + return (0); + + case _PC_ACL_NFS4: + *valp = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *valp = ACL_MAX_ENTRIES; + return (0); + + default: + return (EOPNOTSUPP); + } +} + +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, + int *rahead) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zp->z_zfsvfs->z_os; + zfs_locked_range_t *lr; + vm_object_t object; + off_t start, end, obj_size; + uint_t blksz; + int pgsin_b, pgsin_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + start = IDX_TO_OFF(ma[0]->pindex); + end = IDX_TO_OFF(ma[count - 1]->pindex + 1); + + /* + * Lock a range covering all required and optional pages. + * Note that we need to handle the case of the block size growing. + */ + for (;;) { + blksz = zp->z_blksz; + lr = zfs_rangelock_enter(&zp->z_rangelock, + rounddown(start, blksz), + roundup(end, blksz) - rounddown(start, blksz), RL_READER); + if (blksz == zp->z_blksz) + break; + zfs_rangelock_exit(lr); + } + + object = ma[0]->object; + zfs_vmobject_wlock(object); + obj_size = object->un_pager.vnp.vnp_size; + zfs_vmobject_wunlock(object); + if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (zfs_vm_pagerret_bad); + } + + pgsin_b = 0; + if (rbehind != NULL) { + pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); + pgsin_b = MIN(*rbehind, pgsin_b); + } + + pgsin_a = 0; + if (rahead != NULL) { + pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); + if (end + IDX_TO_OFF(pgsin_a) >= obj_size) + pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); + pgsin_a = MIN(*rahead, pgsin_a); + } + + /* + * NB: we need to pass the exact byte size of the data that we expect + * to read after accounting for the file size. This is required because + * ZFS will panic if we request DMU to read beyond the end of the last + * allocated block. + */ + error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, + MIN(end, obj_size) - (end - PAGE_SIZE)); + + zfs_rangelock_exit(lr); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + if (error != 0) + return (zfs_vm_pagerret_error); + + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); + if (rbehind != NULL) + *rbehind = pgsin_b; + if (rahead != NULL) + *rahead = pgsin_a; + return (zfs_vm_pagerret_ok); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; +}; +#endif + +static int +zfs_freebsd_getpages(struct vop_getpages_args *ap) +{ + + return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead)); +} + +static int +zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, + int *rtvals) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + dmu_tx_t *tx; + struct sf_buf *sf; + vm_object_t object; + vm_page_t m; + caddr_t va; + size_t tocopy; + size_t lo_len; + vm_ooffset_t lo_off; + vm_ooffset_t off; + uint_t blksz; + int ncount; + int pcount; + int err; + int i; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + object = vp->v_object; + pcount = btoc(len); + ncount = pcount; + + KASSERT(ma[0]->object == object, ("mismatching object")); + KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); + + for (i = 0; i < pcount; i++) + rtvals[i] = zfs_vm_pagerret_error; + + off = IDX_TO_OFF(ma[0]->pindex); + blksz = zp->z_blksz; + lo_off = rounddown(off, blksz); + lo_len = roundup(len + (off - lo_off), blksz); + lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); + + zfs_vmobject_wlock(object); + if (len + off > object->un_pager.vnp.vnp_size) { + if (object->un_pager.vnp.vnp_size > off) { + int pgoff; + + len = object->un_pager.vnp.vnp_size - off; + ncount = btoc(len); + if ((pgoff = (int)len & PAGE_MASK) != 0) { + /* + * If the object is locked and the following + * conditions hold, then the page's dirty + * field cannot be concurrently changed by a + * pmap operation. + */ + m = ma[ncount - 1]; + vm_page_assert_sbusied(m); + KASSERT(!pmap_page_is_write_mapped(m), + ("zfs_putpages: page %p is not read-only", + m)); + vm_page_clear_dirty(m, pgoff, PAGE_SIZE - + pgoff); + } + } else { + len = 0; + ncount = 0; + } + if (ncount < pcount) { + for (i = ncount; i < pcount; i++) { + rtvals[i] = zfs_vm_pagerret_bad; + } + } + } + zfs_vmobject_wunlock(object); + + if (ncount == 0) + goto out; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + goto out; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz < PAGE_SIZE) { + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; + va = zfs_map_page(ma[i], &sf); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + zfs_unmap_page(sf); + } + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + /* + * XXX we should be passing a callback to undirty + * but that would make the locking messier + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, + len, 0, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (rtvals[0]); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_putpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; +}; +#endif + +int +zfs_freebsd_putpages(struct vop_putpages_args *ap) +{ + + return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, + ap->a_rtvals)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_bmap_args { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +}; +#endif + +static int +zfs_freebsd_bmap(struct vop_bmap_args *ap) +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_open_args { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_open(struct vop_open_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int error; + + error = zfs_open(&vp, ap->a_mode, ap->a_cred); + if (error == 0) + vnode_create_vobject(vp, zp->z_size, ap->a_td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_close_args { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_close(struct vop_close_args *ap) +{ + + return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_ioctl(struct vop_ioctl_args *ap) +{ + + return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, + ap->a_fflag, ap->a_cred, NULL)); +} + +static int +ioflags(int ioflags) +{ + int flags = 0; + + if (ioflags & IO_APPEND) + flags |= FAPPEND; + if (ioflags & IO_NDELAY) + flags |= FNONBLOCK; + if (ioflags & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_read(struct vop_read_args *ap) +{ + + return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_write_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_write(struct vop_write_args *ap) +{ + + return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_access_args { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_access(struct vop_access_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + accmode_t accmode; + int error = 0; + + + if (ap->a_accmode == VEXEC) { + if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) + return (0); + } + + /* + * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, + */ + accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); + if (accmode != 0) + error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); + + /* + * VADMIN has to be handled by vaccess(). + */ + if (error == 0) { + accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); + if (accmode != 0) { + error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, + zp->z_gid, accmode, ap->a_cred, NULL); + } + } + + /* + * For VEXEC, ensure that at least one execute bit is set for + * non-directories. + */ + if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && + (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { + error = EACCES; + } + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) +{ + struct componentname *cnp = ap->a_cnp; + char nm[NAME_MAX + 1]; + + ASSERT(cnp->cn_namelen < sizeof (nm)); + strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); + + return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, + cnp->cn_cred, cnp->cn_thread, 0, cached)); +} + +static int +zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) +{ + + return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_cache_lookup(struct vop_lookup_args *ap) +{ + zfsvfs_t *zfsvfs; + + zfsvfs = ap->a_dvp->v_mount->mnt_data; + if (zfsvfs->z_use_namecache) + return (vfs_cache_lookup(ap)); + else + return (zfs_freebsd_lookup(ap, B_FALSE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_create_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_create(struct vop_create_args *ap) +{ + zfsvfs_t *zfsvfs; + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc, mode; + + ASSERT(cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + mode = vap->va_mode & ALLPERMS; + zfsvfs = ap->a_dvp->v_mount->mnt_data; + *ap->a_vpp = NULL; + + rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, + &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + if (zfsvfs->z_use_namecache && + rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(ap->a_dvp, *ap->a_vpp, cnp); + + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_remove_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_remove(struct vop_remove_args *ap) +{ + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, + ap->a_cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_mkdir(struct vop_mkdir_args *ap) +{ + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, + ap->a_cnp->cn_cred, 0, NULL); + + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_rmdir(struct vop_rmdir_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readdir_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + ulong_t **a_cookies; +}; +#endif + +static int +zfs_freebsd_readdir(struct vop_readdir_args *ap) +{ + + return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, + ap->a_ncookies, ap->a_cookies)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_fsync_args { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_fsync(struct vop_fsync_args *ap) +{ + + vop_stdfsync(ap); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_getattr(struct vop_getattr_args *ap) +{ + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + ulong_t fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + XVA_SET_REQ(&xvap, XAT_READONLY); + XVA_SET_REQ(&xvap, XAT_ARCHIVE); + XVA_SET_REQ(&xvap, XAT_SYSTEM); + XVA_SET_REQ(&xvap, XAT_HIDDEN); + XVA_SET_REQ(&xvap, XAT_REPARSE); + XVA_SET_REQ(&xvap, XAT_OFFLINE); + XVA_SET_REQ(&xvap, XAT_SPARSE); + + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); + if (error != 0) + return (error); + + /* Convert ZFS xattr into chflags. */ +#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHECK(UF_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHECK(UF_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHECK(UF_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); + +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_setattr(struct vop_setattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cred = ap->a_cred; + xvattr_t xvap; + ulong_t fflags; + uint64_t zflags; + + vattr_init_mask(vap); + vap->va_mask &= ~AT_NOSET; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + + zflags = VTOZ(vp)->z_pflags; + + if (vap->va_flags != VNOVAL) { + zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; + int error; + + if (zfsvfs->z_use_fuids == B_FALSE) + return (EOPNOTSUPP); + + fflags = vap->va_flags; + /* + * XXX KDM + * We need to figure out whether it makes sense to allow + * UF_REPARSE through, since we don't really have other + * facilities to handle reparse points and zfs_setattr() + * doesn't currently allow setting that attribute anyway. + */ + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| + UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| + UF_OFFLINE|UF_SPARSE)) != 0) + return (EOPNOTSUPP); + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. + * Privileged jail processes behave like privileged non-jail + * processes if the PR_ALLOW_CHFLAGS permission bit is set; + * otherwise, they behave like unprivileged processes. + */ + if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || + spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { + error = securelevel_gt(cred, 0); + if (error != 0) + return (error); + } + } else { + /* + * Callers may only modify the file flags on + * objects they have VADMIN rights for. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, + curthread)) != 0) + return (error); + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | + ZFS_NOUNLINK)) { + return (EPERM); + } + if (fflags & + (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { + return (EPERM); + } + } + +#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); +#undef FLAG_CHANGE + } + if (vap->va_birthtime.tv_sec != VNOVAL) { + xvap.xva_vattr.va_mask |= AT_XVATTR; + XVA_SET_REQ(&xvap, XAT_CREATETIME); + } + return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rename_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; +}; +#endif + +static int +zfs_freebsd_rename(struct vop_rename_args *ap) +{ + vnode_t *fdvp = ap->a_fdvp; + vnode_t *fvp = ap->a_fvp; + vnode_t *tdvp = ap->a_tdvp; + vnode_t *tvp = ap->a_tvp; + int error; + + ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); + ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); + + error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred, 1); + + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_symlink_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; +}; +#endif + +static int +zfs_freebsd_symlink(struct vop_symlink_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(cnp->cn_flags & SAVENAME); + + vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, + ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readlink_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_readlink(struct vop_readlink_args *ap) +{ + + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_link_args { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_link(struct vop_link_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *vp = ap->a_vp; + vnode_t *tdvp = ap->a_tdvp; + + if (tdvp->v_mount != vp->v_mount) + return (EXDEV); + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_link(VTOZ(tdvp), VTOZ(vp), + cnp->cn_nameptr, cnp->cn_cred, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + zfs_inactive(vp, ap->a_td->td_ucred, NULL); + return (0); +} + +#if __FreeBSD_version >= 1300042 +#ifndef _SYS_SYSPROTO_H_ +struct vop_need_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + bool need; + + if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER)) + return (true); + need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + return (need); +} +#endif + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp != NULL); + +#if __FreeBSD_version < 1300042 + /* Destroy the vm object and flush associated pages. */ + vnode_destroy_vobject(vp); +#endif + /* + * z_teardown_inactive_lock protects from a race with + * zfs_znode_dmu_fini in zfsvfs_teardown during + * force unmount. + */ + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) + zfs_znode_free(zp); + else + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + vp->v_data = NULL; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_fid_args { + struct vnode *a_vp; + struct fid *a_fid; +}; +#endif + +static int +zfs_freebsd_fid(struct vop_fid_args *ap) +{ + + return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); +} + + +#ifndef _SYS_SYSPROTO_H_ +struct vop_pathconf_args { + struct vnode *a_vp; + int a_name; + register_t *a_retval; +} *ap; +#endif + +static int +zfs_freebsd_pathconf(struct vop_pathconf_args *ap) +{ + ulong_t val; + int error; + + error = zfs_pathconf(ap->a_vp, ap->a_name, &val, + curthread->td_ucred, NULL); + if (error == 0) { + *ap->a_retval = val; + return (error); + } + if (error != EOPNOTSUPP) + return (error); + + switch (ap->a_name) { + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + case _PC_PIPE_BUF: + if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { + *ap->a_retval = PIPE_BUF; + return (0); + } + return (EINVAL); + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * FreeBSD's extended attributes namespace defines file name prefix for ZFS' + * extended attribute name: + * + * NAMESPACE PREFIX + * system freebsd:system: + * user (none, can be used to access ZFS fsattr(5) attributes + * created on Solaris) + */ +static int +zfs_create_attrname(int attrnamespace, const char *name, char *attrname, + size_t size) +{ + const char *namespace, *prefix, *suffix; + + /* We don't allow '/' character in attribute name. */ + if (strchr(name, '/') != NULL) + return (EINVAL); + /* We don't allow attribute names that start with "freebsd:" string. */ + if (strncmp(name, "freebsd:", 8) == 0) + return (EINVAL); + + bzero(attrname, size); + + switch (attrnamespace) { + case EXTATTR_NAMESPACE_USER: +#if 0 + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_USER_STRING; + suffix = ":"; +#else + /* + * This is the default namespace by which we can access all + * attributes created on Solaris. + */ + prefix = namespace = suffix = ""; +#endif + break; + case EXTATTR_NAMESPACE_SYSTEM: + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; + suffix = ":"; + break; + case EXTATTR_NAMESPACE_EMPTY: + default: + return (EINVAL); + } + if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, + name) >= size) { + return (ENAMETOOLONG); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operating to retrieve a named extended attribute. + */ +static int +zfs_getextattr(struct vop_getextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to remove a named attribute. + */ +int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + vnode_t *xvp = NULL, *vp; + int error; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + if (error != 0) { + ZFS_EXIT(zfsvfs); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + NDFREE(&nd, NDF_ONLY_PNBUF); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to set a named attribute. + */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, + NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to retrieve extended attributes on a vnode. + */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + uint8_t dirbuf[sizeof (struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof (attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + if (sizep != NULL) + *sizep = 0; + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + /* + * ENOATTR means that the EA directory does not yet exist, + * i.e. there are no extended attributes there. + */ + if (error == ENOATTR) + error = 0; + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, + UIO_SYSSPACE, ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + do { + uint8_t nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof (dirbuf); + auio.uio_resid = sizeof (dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof (dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done; ) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. + */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && + strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. + */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_getacl(struct vop_getacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) + return (error); + + error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt); + if (vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_setacl(struct vop_setacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + int aclbsize; /* size of acl list in bytes */ + aclent_t *aaclp; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + if (ap->a_aclp == NULL) + return (EINVAL); + + if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries, + * splitting every entry into two and appending "canonical six" + * entries at the end. Don't allow for setting an ACL that would + * cause chmod(2) to run out of ACL entries. + */ + if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) + return (ENOSPC); + + error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); + if (error != 0) + return (error); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + + aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); + error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); + kmem_free(aaclp, aclbsize); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_aclcheck_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) +{ + + return (EOPNOTSUPP); +} + +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. + */ + if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { + char name[MAXNAMLEN + 1]; + znode_t *dzp; + size_t len; + + error = zfs_znode_parent_and_name(zp, &dzp, name); + if (error == 0) { + len = strlen(name); + if (*ap->a_buflen < len) + error = SET_ERROR(ENOMEM); + } + if (error == 0) { + *ap->a_buflen -= len; + bcopy(name, ap->a_buf + *ap->a_buflen, len); + *ap->a_vpp = ZTOV(dzp); + } + ZFS_EXIT(zfsvfs); + return (error); + } + ZFS_EXIT(zfsvfs); + + covered_vp = vp->v_mount->mnt_vnodecovered; +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(covered_vp); +#else + vhold(covered_vp); +#endif + ltype = VOP_ISLOCKED(vp); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300045 + error = vget_finish(covered_vp, LK_SHARED, vs); +#else + error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); +#endif + if (error == 0) { + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); + vput(covered_vp); + } + vn_lock(vp, ltype | LK_RETRY); + if (VN_IS_DOOMED(vp)) + error = SET_ERROR(ENOENT); + return (error); +} + +#ifdef DIAGNOSTIC +#ifndef _SYS_SYSPROTO_H_ +struct vop_lock1_args { + struct vnode *a_vp; + int a_flags; + char *file; + int line; +}; +#endif + +static int +zfs_lock(struct vop_lock1_args *ap) +{ + vnode_t *vp; + znode_t *zp; + int err; + +#if __FreeBSD_version >= 1300064 + err = vop_lock(ap); +#else + err = vop_stdlock(ap); +#endif + if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { + vp = ap->a_vp; + zp = vp->v_data; + if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && + zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); + } + return (err); +} +#endif + +struct vop_vector zfs_vnodeops; +struct vop_vector zfs_fifoops; +struct vop_vector zfs_shareops; + +struct vop_vector zfs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, +#if __FreeBSD_version >= 1300042 + .vop_need_inactive = zfs_freebsd_need_inactive, +#endif + .vop_reclaim = zfs_freebsd_reclaim, + .vop_access = zfs_freebsd_access, + .vop_allocate = VOP_EINVAL, + .vop_lookup = zfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_cachedlookup, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = zfs_freebsd_bmap, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_getpages = zfs_freebsd_getpages, + .vop_putpages = zfs_freebsd_putpages, + .vop_vptocnp = zfs_vptocnp, +#if __FreeBSD_version >= 1300064 +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#else + .vop_lock1 = vop_lock, +#endif + .vop_unlock = vop_unlock, + .vop_islocked = vop_islocked, +#else +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#endif +#endif +}; +VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); + +struct vop_vector zfs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = zfs_freebsd_fsync, + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_fid = zfs_freebsd_fid, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, +}; +VFS_VOP_VECTOR_REGISTER(zfs_fifoops); + +/* + * special share hidden files vnode operations template + */ +struct vop_vector zfs_shareops = { + .vop_default = &default_vnodeops, + .vop_access = zfs_freebsd_access, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_fid = zfs_freebsd_fid, + .vop_pathconf = zfs_freebsd_pathconf, +}; +VFS_VOP_VECTOR_REGISTER(zfs_shareops); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c new file mode 100644 index 000000000..b9d5472b2 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -0,0 +1,1987 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska <[email protected]> */ + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/mntent.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/atomic.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#endif /* _KERNEL */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/refcount.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> +#include <sys/sa.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> +#include <sys/refcount.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, + SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. + */ +krwlock_t zfsvfs_lock; + +static kmem_cache_t *znode_cache = NULL; + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. + */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + POINTER_INVALIDATE(&zp->z_zfsvfs); + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + zp->z_acl_cached = NULL; + zp->z_vnode = NULL; + zp->z_moved = 0; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT3P(zp->z_vnode, ==, NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_acl_lock); + zfs_rangelock_fini(&zp->z_rangelock); + + ASSERT(zp->z_acl_cached == NULL); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + // kmem_cache_set_move(znode_cache, zfs_znode_move); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + rw_destroy(&zfsvfs_lock); +} + + +static int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + znode_t *zp; + int error; + + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + sa_handle_destroy(sharezp->z_sa_hdl); + kmem_cache_free(znode_cache, sharezp); + + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + /* + * Slap on VROOT if we are the root znode unless we are the root + * node of a snapshot mounted under .zfs. + */ + if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) + ZTOV(zp)->v_flag |= VROOT; + + vn_exists(ZTOV(zp)); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +static void +zfs_vnode_forget(vnode_t *vp) +{ + + /* copied from insmntque_stddtr */ + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Construct a new znode/vnode and intialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + vnode_t *vp; + uint64_t mode; + uint64_t parent; +#ifdef notyet + uint64_t mtime[2], ctime[2]; +#endif + uint64_t projid = ZFS_DEFAULT_PROJID; + sa_bulk_attr_t bulk[9]; + int count = 0; + int error; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + +#if __FreeBSD_version >= 1300076 + KASSERT(curthread->td_vp_reserved != NULL, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#else + KASSERT(curthread->td_vp_reserv > 0, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#endif + error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); + if (error != 0) { + kmem_cache_free(znode_cache, zp); + return (NULL); + } + zp->z_vnode = vp; + vp->v_data = zp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + vp = ZTOV(zp); + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); +#ifdef notyet + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); +#endif + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zfs_vnode_forget(vp); + zp->z_vnode = NULL; + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_projid = projid; + zp->z_mode = mode; + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; + + vp->v_type = IFTOVT((mode_t)mode); + + switch (vp->v_type) { + case VDIR: + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VFIFO: + vp->v_op = &zfs_fifoops; + break; + case VREG: + if (parent == zfsvfs->z_shares_dir) { + ASSERT(zp->z_uid == 0 && zp->z_gid == 0); + vp->v_op = &zfs_shareops; + } + break; + default: + break; + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Acquire vnode lock before making it available to the world. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(vp); + if (vp->v_type != VFIFO) + VN_LOCK_ASHARE(vp); + + return (zp); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. + * + * OUT: zpp - allocated znode + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int err; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + + ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + vfs_timestamp(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + /* + * No execs denied will be deterimed when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + ASSERT(*zpp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. + */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + ×, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + vnode_t *vp; + sa_handle_t *hdl; + struct thread *td; + int locked; + int err; + + td = curthread; + getnewvnode_reserve_(); +again: + *zpp = NULL; + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + err = SET_ERROR(ENOENT); + } else { + vp = ZTOV(zp); + /* + * Don't let the vnode disappear after + * ZFS_OBJ_HOLD_EXIT. + */ + VN_HOLD(vp); + *zpp = zp; + err = 0; + } + + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err) { + getnewvnode_drop_reserve(); + return (err); + } + + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ + VI_UNLOCK(vp); + + /* + * XXX vrele() locks the vnode when the last reference + * is dropped. Although in this case the vnode is + * doomed / dead and so no inactivation is required, + * the vnode lock is still acquired. That could result + * in a LOR with z_teardown_lock if another thread holds + * the vnode's lock and tries to take z_teardown_lock. + * But that is only possible if the other thread peforms + * a ZFS vnode operation on the vnode. That either + * should not happen if the vnode is dead or the thread + * should also have a refrence to the vnode and thus + * our reference is not last. + */ + VN_RELE(vp); + goto again; + } + VI_UNLOCK(vp); + getnewvnode_drop_reserve(); + return (err); + } + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + if (err == 0) { + vnode_t *vp = ZTOV(zp); + + err = insmntque(vp, zfsvfs->z_vfs); + if (err == 0) { + vp->v_hash = obj_num; + VOP_UNLOCK1(vp); + } else { + zp->z_vnode = NULL; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + *zpp = NULL; + } + } + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + vnode_t *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + + /* + * Remove cached pages before reloading the znode, so that they are not + * lingering after we run into any error. Ideally, we should vgone() + * the vnode in case of error, but currently we cannot do that + * because of the LOR between the vnode lock and z_teardown_lock. + * So, instead, we have to "doom" the znode in the illumos style. + */ + vp = ZTOV(zp); + vn_pages_remove(vp, 0, 0); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + mutex_exit(&zp->z_acl_lock); + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. At present the only cached property is the + * files type recorded in v_type. + * So, handle this case by leaving the old vnode and znode + * disassociated from the actual object. A new vnode and a + * znode will be created if the object is accessed + * (e.g. via a look-up). The old vnode and znode will be + * recycled when the last vnode reference is dropped. + */ + if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatical removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. + */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (0); + } + + zp->z_blksz = doi.doi_data_block_size; + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_rmnode(zp); + return; + } + } + + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_sa_hdl == NULL); + zp->z_vnode = NULL; + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes--; + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); + +} + +void +zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + vfs_timestamp(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & AT_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & AT_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); +} +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + int i; + znode_t *rootzp = NULL; + zfsvfs_t *zfsvfs; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + VATTR_NULL(&vattr); + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (SET_ERROR(EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c new file mode 100644 index 000000000..d6496a7ef --- /dev/null +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -0,0 +1,1882 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/zio_crypt.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zil.h> +#include <sys/sha2.h> +#include <sys/hkdf.h> + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authentication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearranged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpreted. + * + * At the objset level, we maintain 2 separate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintained in the metadnode's bps. The second, is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaintext as + * well all that we end up with is a duplicate of the original ciphertext we + * already had. As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC + * here so that a reproducible checksum of the plaintext is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +/* + * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many + * calls, to test decryption error handling code paths. + */ +uint64_t zio_decrypt_fail_fraction = 0; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +static void +zio_crypt_key_destroy_early(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + bzero(&key->zk_session, sizeof (key->zk_session)); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + + freebsd_crypt_freesession(&key->zk_session); + zio_crypt_key_destroy_early(key); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech __unused; + uint_t keydata_len; + zio_crypt_info_t *ci = NULL; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + ret = freebsd_crypt_newsession(&key->zk_session, ci, + &key->zk_current_key); + if (ret) + goto error; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech __unused; + + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + freebsd_crypt_freesession(&key->zk_session); + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[key->zk_crypt], &key->zk_current_key); + if (ret != 0) + goto out_unlock; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int failed_decrypt_size; + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the ciphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. + */ +/* + * The implemenation for FreeBSD's OpenCrypto. + * + * The big difference between ICP and FOC is that FOC uses a single + * buffer for input and output. This means that (for AES-GCM, the + * only one supported right now) the source must be copied into the + * destination, and the destination must have the AAD, and the tag/MAC, + * already associated with it. (Both implementations can use a uio.) + * + * Since the auth data is part of the iovec array, all we need to know + * is the length: 0 means there's no AAD. + * + */ +static int +zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, + uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, + uio_t *uio, uint_t auth_len) +{ + zio_crypt_info_t *ci; + int ret; + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + + ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, + datalen, auth_len); + if (ret != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Returning error %s\n", + __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); +#endif + ret = SET_ERROR(encrypt ? EIO : ECKSUM); + } + + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GMC at least) + * needs to logically go in front. + */ + uio_t cuio; + iovec_t iovecs[4]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* + * Since we only support one buffer, we need to copy + * the plain text (source) to the cipher buffer (dest). + * We set iovecs[0] -- the authentication data -- below. + */ + bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len); + bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out, + SHA512_HMAC_KEYLEN); + iovecs[1].iov_base = keydata_out; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = hmac_keydata_out; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. + */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GMC at least) + * needs to logically go in front. + */ + uio_t cuio; + iovec_t iovecs[4]; + void *src, *dst; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* + * Since we only support one buffer, we need to copy + * the encrypted buffer (source) to the plain buffer + * (dest). We set iovecs[0] -- the authentication data -- + * below. + */ + dst = key->zk_master_keydata; + src = keydata; + + bcopy(src, dst, keydata_len); + + dst = key->zk_hmac_keydata; + src = hmac_keydata; + bcopy(src, dst, SHA512_HMAC_KEYLEN); + + iovecs[1].iov_base = key->zk_master_keydata; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = key->zk_hmac_keydata; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[crypt], &key->zk_current_key); + if (ret != 0) + goto error; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + crypto_mac(&key->zk_hmac_key, data, datalen, + raw_digestbuf, SHA512_DIGEST_LENGTH); + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. + */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. + */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + int avoidlint = SPA_MINBLOCKSIZE; + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, avoidlint); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, avoidlint); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. + */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + crypto_mac_update(ctx, &bab, bab_len); + + return (0); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +/* ARGSUSED */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + struct hmac_ctx hash_ctx; + struct hmac_ctx *ctx = &hash_ctx; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + + /* calculate the portable MAC from the portable fields and metadnode */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. + */ + if ((datalen >= OBJSET_PHYS_SIZE_V3 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE && + osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || + (datalen >= OBJSET_PHYS_SIZE_V2 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_PHYS_SIZE_V1)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* XXX check dnode type ... */ + /* add in fields from the user accounting dnodes */ + if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + } + + if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + } + + if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && + datalen >= OBJSET_PHYS_SIZE_V3) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_projectused_dnode); + if (ret) + goto error; + } + + crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); +#endif + return (SET_ERROR(ECKSUM)); + } + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. + */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +/* + * The OpenCrypto used in FreeBSD does not use seperate source and + * destination buffers; instead, the same buffer is used. Further, to + * accomodate some of the drivers, the authbuf needs to be logically before + * the data. This means that we need to copy the source to the destination, + * and set up an extra iovec_t at the beginning to handle the authbuf. + * It also means we'll only return one uio_t, which we do via the clumsy + * ifdef in the function declaration. + */ + +/* ARGSUSED */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, + uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* + * We need at least two iovecs -- one for the AAD, + * one for the MAC. + */ + bcopy(src, dst, datalen); + nr_dst = 2; + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src = 0; + nr_dst += nr_iovecs; + + /* allocate the iovec arrays */ + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(src_iovecs, nr_src * sizeof (iovec_t)); + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(dst_iovecs, nr_dst * sizeof (iovec_t)); + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. + */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + /* The first one will contain the authbuf */ + nr_iovecs = 1; + + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + ASSERT3P(dst_iovecs, !=, NULL); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. + */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_base = (char *) + dlrp + sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + dst_iovecs[0].iov_base = aadbuf; + dst_iovecs[0].iov_len = aad_len; + + out_uio->uio_iov = dst_iovecs; + out_uio->uio_iovcnt = nr_dst; + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. + */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + bcopy(src, dst, datalen); + nr_dst = 2; + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. + */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src = 0; + nr_dst += nr_iovecs; + + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(src_iovecs, nr_src * sizeof (iovec_t)); + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(dst_iovecs, nr_dst * sizeof (iovec_t)); + } + + nr_iovecs = 1; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). + */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_dst); + ASSERT3P(dst_iovecs, !=, NULL); + dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + dst_iovecs[0].iov_base = aadbuf; + dst_iovecs[0].iov_len = aad_len; + out_uio->uio_iov = dst_iovecs; + out_uio->uio_iovcnt = nr_dst; + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* ARGSUSED */ +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio, + uint_t *enc_len) +{ + int ret; + uint_t nr_plain = 1, nr_cipher = 2; + iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; + void *src, *dst; + + cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), + KM_SLEEP); + if (!cipher_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + } else { + src = cipherbuf; + dst = plainbuf; + } + bcopy(src, dst, datalen); + cipher_iovecs[0].iov_base = dst; + cipher_iovecs[0].iov_len = datalen; + + *enc_len = datalen; + out_uio->uio_iov = cipher_iovecs; + out_uio->uio_iovcnt = nr_cipher; + + return (0); + +error: + if (plain_iovecs != NULL) + kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); + if (cipher_iovecs != NULL) + kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + *enc_len = 0; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. + */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + iovec_t *mac_iov; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + /* populate the uios */ + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = (void *)mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int faile_decrypt_size; + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + uint_t enc_len, auth_len; + uio_t puio, cuio; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + freebsd_crypt_session_t *tmpl = NULL; + uint8_t *authbuf = NULL; + + bzero(&puio, sizeof (uio_t)); + bzero(&cuio, sizeof (uio_t)); + +#ifdef FCRYPTO_DEBUG + printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", + __FUNCTION__, + encrypt ? "encrypt" : "decrypt", + key, salt, ot, iv, mac, datalen, + byteswap ? "byteswap" : "native_endian", plainbuf, + cipherbuf, no_crypt); + + printf("\tkey = {"); + for (int i = 0; i < key->zk_current_key.ck_length/8; i++) + printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); + printf("}\n"); +#endif + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + if (ret != 0) + return (ret); + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. + */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = &key->zk_session; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + /* perform the encryption / decryption */ + ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, + ckey, iv, enc_len, &cuio, auth_len); + if (ret != 0) + goto error; + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (0); + +error: + if (!encrypt) { + if (failed_decrypt_buf != NULL) + kmem_free(failed_decrypt_buf, failed_decrypt_size); + failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); + failed_decrypt_size = datalen; + bcopy(cipherbuf, failed_decrypt_buf, datalen); + } + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + return (SET_ERROR(ret)); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (SET_ERROR(ret)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c new file mode 100644 index 000000000..bef97a9b3 --- /dev/null +++ b/module/os/freebsd/zfs/zvol_os.c @@ -0,0 +1,1476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2006-2010 Pawel Jakub Dawidek <[email protected]> + * All rights reserved. + * + * Portions Copyright 2010 Robert Milkowski + * + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2011 Martin Matuska <[email protected]> */ + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol/<pool_name>/<dataset_name> + * + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + * + * On FreeBSD ZVOLs are simply GEOM providers like any other storage device + * in the system. Except when they're simply character devices (volmode=dev). + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/disk.h> +#include <sys/dmu_traverse.h> +#include <sys/dnode.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/byteorder.h> +#include <sys/sunddi.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/queue.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ioctl.h> +#include <sys/zil.h> +#include <sys/refcount.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_rlock.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> +#include <sys/zvol.h> +#include <sys/zil_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/zfeature.h> +#include <sys/zio_checksum.h> +#include <sys/zil_impl.h> +#include <sys/filio.h> + +#include <geom/geom.h> +#include <sys/zvol.h> +#include <sys/zvol_impl.h> + +#include "zfs_namecheck.h" + +#define ZVOL_DUMPSIZE "dumpsize" + +#ifdef ZVOL_LOCK_DEBUG +#define ZVOL_RW_READER RW_WRITER +#define ZVOL_RW_READ_HELD RW_WRITE_HELD +#else +#define ZVOL_RW_READER RW_READER +#define ZVOL_RW_READ_HELD RW_READ_HELD +#endif + +enum zvol_geom_state { + ZVOL_GEOM_UNINIT, + ZVOL_GEOM_STOPPED, + ZVOL_GEOM_RUNNING, +}; + +struct zvol_state_os { + int zso_volmode; +#define zso_dev _zso_state._zso_dev +#define zso_geom _zso_state._zso_geom + union { + /* volmode=dev */ + struct zvol_state_dev { + struct cdev *zsd_cdev; + uint64_t zsd_sync_cnt; + } _zso_dev; + + /* volmode=geom */ + struct zvol_state_geom { + struct g_provider *zsg_provider; + struct bio_queue_head zsg_queue; + struct mtx zsg_queue_mtx; + enum zvol_geom_state zsg_state; + } _zso_geom; + } _zso_state; +}; + +struct proc *zfsproc; + +static uint32_t zvol_minors; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, + "Expose as GEOM providers (1), device files (2) or neither"); +static boolean_t zpool_on_zvol = B_FALSE; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, + "Allow zpools to use zvols as vdevs (DANGEROUS)"); + +/* + * Toggle unmap functionality. + */ +boolean_t zvol_unmap_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, + &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); + +/* + * zvol maximum transfer in one DMU tx. + */ +int zvol_maxphys = DMU_MAX_ACCESS / 2; + +static void zvol_ensure_zilog(zvol_state_t *zv); + +static d_open_t zvol_cdev_open; +static d_close_t zvol_cdev_close; +static d_ioctl_t zvol_cdev_ioctl; +static d_read_t zvol_cdev_read; +static d_write_t zvol_cdev_write; +static d_strategy_t zvol_geom_bio_strategy; + +static struct cdevsw zvol_cdevsw = { + .d_name = "zvol", + .d_version = D_VERSION, + .d_flags = D_DISK | D_TRACKCLOSE, + .d_open = zvol_cdev_open, + .d_close = zvol_cdev_close, + .d_ioctl = zvol_cdev_ioctl, + .d_read = zvol_cdev_read, + .d_write = zvol_cdev_write, + .d_strategy = zvol_geom_bio_strategy, +}; + +extern uint_t zfs_geom_probe_vdev_key; + +struct g_class zfs_zvol_class = { + .name = "ZFS::ZVOL", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); + +static int zvol_geom_open(struct g_provider *pp, int flag, int count); +static int zvol_geom_close(struct g_provider *pp, int flag, int count); +static void zvol_geom_run(zvol_state_t *zv); +static void zvol_geom_destroy(zvol_state_t *zv); +static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); +static void zvol_geom_worker(void *arg); +static void zvol_geom_bio_start(struct bio *bp); +static int zvol_geom_bio_getattr(struct bio *bp); +static void zvol_geom_bio_check_zilog(struct bio *bp); +/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ + +/* + * GEOM mode implementation + */ + +/*ARGSUSED*/ +static int +zvol_geom_open(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { + /* + * if zfs_geom_probe_vdev_key is set, that means that zfs is + * attempting to probe geom providers while looking for a + * replacement for a missing VDEV. In this case, the + * spa_namespace_lock will not be held, but it is still illegal + * to use a zvol as a vdev. Deadlocks can result if another + * thread has spa_namespace_lock + */ + return (SET_ERROR(EOPNOTSUPP)); + } + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flag & FWRITE)); + if (err) + goto out_mutex; + pp->mediasize = zv->zv_volsize; + pp->stripeoffset = 0; + pp->stripesize = zv->zv_volblocksize; + } + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || + dmu_objset_incompatible_encryption_version(zv->zv_objset))) { + err = EROFS; + goto out_open_count; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_open_count; + } +#ifdef FEXCL + if (flag & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_open_count; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count += count; + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_mutex: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +/*ARGSUSED*/ +static int +zvol_geom_close(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count > 0); + + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if ((zv->zv_open_count - count) == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count -= count; + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static void +zvol_geom_run(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_error_provider(pp, 0); + + kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0, + "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); +} + +static void +zvol_geom_destroy(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_topology_assert(); + + mutex_enter(&zv->zv_state_lock); + VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); + mutex_exit(&zv->zv_state_lock); + zsg->zsg_provider = NULL; + pp->private = NULL; + g_wither_geom(pp->geom, ENXIO); +} + +static int +zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) +{ + int count, error, flags; + + g_topology_assert(); + + /* + * To make it easier we expect either open or close, but not both + * at the same time. + */ + KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || + (acr <= 0 && acw <= 0 && ace <= 0), + ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", + pp->name, acr, acw, ace)); + + if (pp->private == NULL) { + if (acr <= 0 && acw <= 0 && ace <= 0) + return (0); + return (pp->error); + } + + /* + * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if + * ace != 0, because GEOM already handles that and handles it a bit + * differently. GEOM allows for multiple read/exclusive consumers and + * ZFS allows only one exclusive consumer, no matter if it is reader or + * writer. I like better the way GEOM works so I'll leave it for GEOM + * to decide what to do. + */ + + count = acr + acw + ace; + if (count == 0) + return (0); + + flags = 0; + if (acr != 0 || ace != 0) + flags |= FREAD; + if (acw != 0) + flags |= FWRITE; + + g_topology_unlock(); + if (count > 0) + error = zvol_geom_open(pp, flags, count); + else + error = zvol_geom_close(pp, flags, -count); + g_topology_lock(); + return (error); +} + +static void +zvol_geom_worker(void *arg) +{ + zvol_state_t *zv = arg; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct bio *bp; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + for (;;) { + mtx_lock(&zsg->zsg_queue_mtx); + bp = bioq_takefirst(&zsg->zsg_queue); + if (bp == NULL) { + if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { + zsg->zsg_state = ZVOL_GEOM_RUNNING; + wakeup(&zsg->zsg_state); + mtx_unlock(&zsg->zsg_queue_mtx); + kthread_exit(); + } + msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, + PRIBIO | PDROP, "zvol:io", 0); + continue; + } + mtx_unlock(&zsg->zsg_queue_mtx); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_geom_bio_check_zilog(bp); + switch (bp->bio_cmd) { + case BIO_FLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + zvol_geom_bio_strategy(bp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + rw_exit(&zv->zv_suspend_lock); + } +} + +static void +zvol_geom_bio_start(struct bio *bp) +{ + zvol_state_t *zv = bp->bio_to->private; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + boolean_t first; + + if (bp->bio_cmd == BIO_GETATTR) { + if (zvol_geom_bio_getattr(bp)) + g_io_deliver(bp, EOPNOTSUPP); + return; + } + + if (!THREAD_CAN_SLEEP()) { + mtx_lock(&zsg->zsg_queue_mtx); + first = (bioq_first(&zsg->zsg_queue) == NULL); + bioq_insert_tail(&zsg->zsg_queue, bp); + mtx_unlock(&zsg->zsg_queue_mtx); + if (first) + wakeup_one(&zsg->zsg_queue); + return; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_geom_bio_check_zilog(bp); + + switch (bp->bio_cmd) { + case BIO_FLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + zvol_geom_bio_strategy(bp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + rw_exit(&zv->zv_suspend_lock); +} + +static int +zvol_geom_bio_getattr(struct bio *bp) +{ + zvol_state_t *zv; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + + spa_t *spa = dmu_objset_spa(zv->zv_objset); + uint64_t refd, avail, usedobjs, availobjs; + + if (g_handleattr_int(bp, "GEOM::candelete", 1)) + return (0); + if (strcmp(bp->bio_attribute, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksavail", + avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) + return (0); + } + return (1); +} + +static void +zvol_geom_bio_check_zilog(struct bio *bp) +{ + zvol_state_t *zv; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + + switch (bp->bio_cmd) { + case BIO_FLUSH: + case BIO_WRITE: + case BIO_DELETE: + zvol_ensure_zilog(zv); + default: + break; + } +} + +static void +zvol_geom_bio_strategy(struct bio *bp) +{ + zvol_state_t *zv; + uint64_t off, volsize; + size_t resid; + char *addr; + objset_t *os; + zfs_locked_range_t *lr; + int error = 0; + boolean_t doread = 0; + boolean_t is_dumpified; + boolean_t sync; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + goto out; + } + + if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { + error = SET_ERROR(EROFS); + goto out; + } + + switch (bp->bio_cmd) { + case BIO_FLUSH: + goto sync; + case BIO_READ: + doread = 1; + case BIO_WRITE: + case BIO_DELETE: + break; + default: + error = EOPNOTSUPP; + goto out; + } + + off = bp->bio_offset; + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + + addr = bp->bio_data; + resid = bp->bio_length; + + if (resid > 0 && (off < 0 || off >= volsize)) { + error = SET_ERROR(EIO); + goto out; + } + + is_dumpified = B_FALSE; + sync = !doread && !is_dumpified && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, + doread ? RL_READER : RL_WRITER); + + if (bp->bio_cmd == BIO_DELETE) { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, off, resid, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + off, resid); + resid = 0; + } + goto unlock; + } + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); + if (doread) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, sync); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + off += size; + addr += size; + resid -= size; + } +unlock: + zfs_rangelock_exit(lr); + + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length && off > volsize) + error = EINVAL; + + if (sync) { +sync: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } +out: + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); +} + +/* + * Character device mode implementation + */ + +static int +zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + /* + * uio_loffset == volsize isn't an error as + * its required for EOF processing. + */ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_READER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio->uio_loffset) + bytes = volsize - uio->uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + + return (error); +} + +static int +zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + sync = (ioflag & IO_SYNC) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + rw_exit(&zv->zv_suspend_lock); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_WRITER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + return (error); +} + +static int +zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flags & FWRITE)); + if (err) + goto out_locked; + } + + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = EROFS; + goto out_opened; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_opened; + } +#ifdef FEXCL + if (flags & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_opened; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count++; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt++; + if (zsd->zsd_sync_cnt == 1) + zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); + } + + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_opened: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_locked: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +static int +zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count > 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count--; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt--; + } + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static int +zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, + int fflag, struct thread *td) +{ + zvol_state_t *zv; + zfs_locked_range_t *lr; + off_t offset, length; + int i, error; + boolean_t sync; + + zv = dev->si_drv2; + + error = 0; + KASSERT(zv->zv_open_count > 0, + ("Device with zero access count in %s", __func__)); + + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(uint32_t *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + break; + case DIOCGFLUSH: + if (zv->zv_zilog != NULL) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + break; + case DIOCGDELETE: + if (!zvol_unmap_enabled) + break; + + offset = ((off_t *)data)[0]; + length = ((off_t *)data)[1]; + if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || + offset < 0 || offset >= zv->zv_volsize || + length <= 0) { + printf("%s: offset=%jd length=%jd\n", __func__, offset, + length); + error = EINVAL; + break; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, + RL_WRITER); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + sync = FALSE; + dmu_tx_abort(tx); + } else { + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_log_truncate(zv, tx, offset, length, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + offset, length); + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGSTRIPESIZE: + *(off_t *)data = zv->zv_volblocksize; + break; + case DIOCGSTRIPEOFFSET: + *(off_t *)data = 0; + break; + case DIOCGATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + struct diocgattr_arg *arg = (struct diocgattr_arg *)data; + uint64_t refd, avail, usedobjs, availobjs; + + if (strcmp(arg->name, "GEOM::candelete") == 0) + arg->value.i = 1; + else if (strcmp(arg->name, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = refd / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc( + spa_normal_class(spa)); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = refd / DEV_BSIZE; + } else + error = ENOIOCTL; + break; + } + case FIOSEEKHOLE: + case FIOSEEKDATA: { + off_t *off = (off_t *)data; + uint64_t noff; + boolean_t hole; + + hole = (cmd == FIOSEEKHOLE); + noff = *off; + error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + *off = noff; + break; + } + default: + error = ENOIOCTL; + } + + return (error); +} + +/* + * Misc. helpers + */ + +static void +zvol_ensure_zilog(zvol_state_t *zv) +{ + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } +} + +static boolean_t +zvol_is_zvol_impl(const char *device) +{ + return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + struct g_geom *gp; + + g_topology_lock(); + gp = pp->geom; + ASSERT(gp != NULL); + + zsg->zsg_provider = NULL; + g_wither_provider(pp, ENXIO); + + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = zv->zv_volsize; + pp->private = zv; + zsg->zsg_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + dev = zsd->zsd_cdev; + if (dev != NULL) { + destroy_dev(dev); + dev = zsd->zsd_cdev = NULL; + if (zv->zv_open_count > 0) { + zv->zv_flags &= ~ZVOL_EXCL; + zv->zv_open_count = 0; + /* XXX need suspend lock but lock order */ + zvol_last_close(zv); + } + } + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) + == 0) { + dev->si_iosize_max = MAXPHYS; + zsd->zsd_cdev = dev; + } + } + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); +} + +/* + * Remove minor node for the specified volume. + */ +static void +zvol_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count == 0); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + + g_topology_lock(); + zvol_geom_destroy(zv); + g_topology_unlock(); + mtx_destroy(&zsg->zsg_queue_mtx); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + if (dev != NULL) + destroy_dev(dev); + } + + mutex_destroy(&zv->zv_state_lock); + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + zvol_minors--; +} + +/* + * Create a minor node (plus a whole lot more) for the specified volume. + */ +static int +zvol_create_minor_impl(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + uint64_t volmode, hash; + int error; + + ZFS_LOG(1, "Creating ZVOL %s...", name); + + hash = zvol_name_hash(name); + if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(EEXIST)); + } + + DROP_GIANT(); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); + if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + /* + * zvol_alloc equivalent ... + */ + zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); + zv->zv_hash = hash; + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso->zso_volmode = volmode; + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp; + struct g_geom *gp; + + zsg->zsg_state = ZVOL_GEOM_UNINIT; + mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); + + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_bio_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + /* TODO: NULL check? */ + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zsg->zsg_provider = pp; + bioq_init(&zsg->zsg_queue); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); + if (error != 0) { + mutex_destroy(&zv->zv_state_lock); + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (*zv)); + dmu_objset_disown(os, B_TRUE, FTAG); + goto out_giant; + } + dev->si_iosize_max = MAXPHYS; + zsd->zsd_cdev = dev; + } + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + + if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + + /* XXX do prefetch */ + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (error == 0) + zvol_geom_run(zv); + g_topology_unlock(); + } +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + zvol_minors++; + rw_exit(&zvol_state_lock); + } + ZFS_LOG(1, "ZVOL %s created.", name); +out_giant: + PICKUP_GIANT(); + return (error); +} + +static void +zvol_clear_private(zvol_state_t *zv) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return; + + mtx_lock(&zsg->zsg_queue_mtx); + zsg->zsg_state = ZVOL_GEOM_STOPPED; + pp->private = NULL; + wakeup_one(&zsg->zsg_queue); + while (zsg->zsg_state != ZVOL_GEOM_RUNNING) + msleep(&zsg->zsg_state, + &zsg->zsg_queue_mtx, + 0, "zvol:w", 0); + mtx_unlock(&zsg->zsg_queue_mtx); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + zv->zv_volsize = volsize; + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return (0); + + g_topology_lock(); + + /* + * Do not invoke resize event when initial size was zero. + * ZVOL initializes the size on first open, this is not + * real resizing. + */ + if (pp->mediasize == 0) + pp->mediasize = zv->zv_volsize; + else + g_resize_provider(pp, zv->zv_volsize); + + g_topology_unlock(); + } + return (0); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_freebsd_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_create_minor_impl, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +/* + * Public interfaces + */ + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +int +zvol_init(void) +{ + zvol_init_impl(); + zvol_register_ops(&zvol_freebsd_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); +} |