summaryrefslogtreecommitdiffstats
path: root/zfs/lib/libdmu-ctl
diff options
context:
space:
mode:
Diffstat (limited to 'zfs/lib/libdmu-ctl')
-rw-r--r--zfs/lib/libdmu-ctl/dctl_client.c263
-rw-r--r--zfs/lib/libdmu-ctl/dctl_common.c109
-rw-r--r--zfs/lib/libdmu-ctl/dctl_server.c476
-rw-r--r--zfs/lib/libdmu-ctl/dctl_thrpool.c253
-rw-r--r--zfs/lib/libdmu-ctl/dmu_send.c1249
-rw-r--r--zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h71
-rw-r--r--zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h144
-rw-r--r--zfs/lib/libdmu-ctl/rrwlock.c249
-rw-r--r--zfs/lib/libdmu-ctl/zfs_acl.c2641
-rw-r--r--zfs/lib/libdmu-ctl/zfs_ctldir.c1147
-rw-r--r--zfs/lib/libdmu-ctl/zfs_dir.c968
-rw-r--r--zfs/lib/libdmu-ctl/zfs_fuid.c688
-rw-r--r--zfs/lib/libdmu-ctl/zfs_ioctl.c3055
-rw-r--r--zfs/lib/libdmu-ctl/zfs_log.c693
-rw-r--r--zfs/lib/libdmu-ctl/zfs_replay.c876
-rw-r--r--zfs/lib/libdmu-ctl/zfs_rlock.c602
-rw-r--r--zfs/lib/libdmu-ctl/zfs_vfsops.c1671
-rw-r--r--zfs/lib/libdmu-ctl/zfs_vnops.c4558
-rw-r--r--zfs/lib/libdmu-ctl/zvol.c1830
19 files changed, 0 insertions, 21543 deletions
diff --git a/zfs/lib/libdmu-ctl/dctl_client.c b/zfs/lib/libdmu-ctl/dctl_client.c
deleted file mode 100644
index e3d8f305b..000000000
--- a/zfs/lib/libdmu-ctl/dctl_client.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ftw.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/debug.h>
-
-#include <sys/dmu_ctl.h>
-#include <sys/dmu_ctl_impl.h>
-
-/*
- * Try to connect to the UNIX-domain stream socket at the given path.
- *
- * For nftw() convenience, returns 0 if unsuccessful (the path does not fit
- * in sun_path, the socket could not be created, or connect() failed),
- * otherwise returns the connected socket descriptor, which is never 0.
- */
-static int try_connect(const char *path)
-{
-	struct sockaddr_un name;
-	size_t pathlen = strlen(path);
-	int sock;
-
-	/*
-	 * Refuse paths that would have to be truncated: a truncated path
-	 * could name a different socket than the one that was asked for.
-	 */
-	if (pathlen >= sizeof(name.sun_path))
-		return 0;
-
-	sock = socket(PF_UNIX, SOCK_STREAM, 0);
-	if (sock == -1) {
-		perror("socket");
-		return 0;
-	}
-
-	/*
-	 * The socket fd cannot be 0 otherwise nftw() will not interpret the
-	 * return code correctly.
-	 */
-	VERIFY(sock != 0);
-
-	/* Zero the whole address so no uninitialized bytes reach connect() */
-	memset(&name, 0, sizeof(name));
-	name.sun_family = AF_UNIX;
-	memcpy(name.sun_path, path, pathlen + 1);
-
-	if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) {
-		close(sock);
-		return 0;
-	}
-
-	return sock;
-}
-
-/*
- * nftw() callback.
- *
- * Only entries that are sockets and whose base name equals SOCKNAME are
- * tried; everything else returns 0 so that the walk continues. For a
- * matching entry the result of try_connect() is returned, so a non-zero
- * (connected) descriptor stops the walk.
- */
-static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag,
-    struct FTW *ftwbuf)
-{
-	const char *leaf = &fpath[ftwbuf->base];
-
-	if (S_ISSOCK(sb->st_mode) && strcmp(leaf, SOCKNAME) == 0)
-		return try_connect(fpath);
-
-	return 0;
-}
-
-/*
- * Connect to the control socket that lives in (or below) dir.
- *
- * When check_subdirs is true the directory tree is walked with nftw() and
- * the first socket named SOCKNAME that accepts a connection is used.
- * Otherwise only "<dir>/SOCKNAME" is tried.
- *
- * Returns the connected descriptor, or -1 if no connection could be made.
- */
-int dctlc_connect(const char *dir, boolean_t check_subdirs)
-{
-	int sock;
-
-	if (check_subdirs) {
-		sock = nftw(dir, nftw_cb, 10, FTW_PHYS);
-	} else {
-		/* Room for dir, the '/' separator, SOCKNAME and the NUL */
-		char *sockpath = malloc(strlen(dir) + strlen(SOCKNAME) + 2);
-
-		if (sockpath == NULL)
-			return -1;
-
-		strcpy(sockpath, dir);
-		strcat(sockpath, "/" SOCKNAME);
-
-		sock = try_connect(sockpath);
-
-		free(sockpath);
-	}
-
-	/* try_connect() and an exhausted nftw() walk both report 0 */
-	if (sock == 0)
-		return -1;
-
-	return sock;
-}
-
-/*
- * Tear down a connection obtained from dctlc_connect().
- *
- * The socket is shut down in both directions and then closed, so that the
- * descriptor is released instead of leaking. Errors are ignored: there is
- * nothing useful a caller could do about them at this point.
- */
-void dctlc_disconnect(int fd)
-{
-	(void) shutdown(fd, SHUT_RDWR);
-	(void) close(fd);
-}
-
-/*
- * Answer a DCTL_COPYIN request: send the bytes found at the pointer carried
- * in the request back over the socket. Returns the dctl_send_data() result.
- */
-static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd)
-{
-	void *src = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
-
-	return dctl_send_data(fd, src, cmd->u.dcmd_copy.size);
-}
-
-/*
- * Answer a DCTL_COPYINSTR request.
- *
- * The request carries a pointer to a string and the size of the
- * destination buffer. A DCTL_GEN_REPLY is sent whose size is the number of
- * string bytes that follow (never more than size - 1) and whose rc is 0 if
- * the whole string fits, or ENAMETOOLONG if it had to be cut short. The
- * string bytes, without the terminating NUL, are sent right after it.
- *
- * Returns 0 on success or the error reported by the send routines.
- */
-static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd)
-{
-	dctl_cmd_t reply;
-	const char *from;
-	size_t buflen, to_copy;
-	int error;
-
-	/* The whole structure goes on the wire; don't leak stack contents */
-	memset(&reply, 0, sizeof(reply));
-	reply.dcmd_msg = DCTL_GEN_REPLY;
-
-	from = (const char *)(uintptr_t) cmd->u.dcmd_copy.ptr;
-	buflen = cmd->u.dcmd_copy.size;
-
-	if (buflen == 0) {
-		/*
-		 * "buflen - 1" would wrap around and make strnlen() scan
-		 * without any bound. Nothing fits in an empty buffer.
-		 */
-		to_copy = 0;
-		reply.u.dcmd_reply.rc = ENAMETOOLONG;
-	} else {
-		to_copy = strnlen(from, buflen - 1);
-		reply.u.dcmd_reply.rc =
-		    from[to_copy] == '\0' ? 0 : ENAMETOOLONG;
-	}
-	reply.u.dcmd_reply.size = to_copy;
-
-	error = dctl_send_msg(fd, &reply);
-
-	if (!error && to_copy > 0)
-		error = dctl_send_data(fd, from, to_copy);
-
-	return error;
-}
-
-/*
- * Answer a DCTL_COPYOUT request: read the announced number of bytes from
- * the socket into the memory the request points at. Returns the
- * dctl_read_data() result.
- */
-static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd)
-{
-	void *dst = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
-
-	return dctl_read_data(fd, dst, cmd->u.dcmd_copy.size);
-}
-
-/*
- * Answer a DCTL_FD_READ request: read up to the requested number of bytes
- * from the local descriptor named in the request and send them back.
- *
- * A DCTL_GEN_REPLY is sent first; its rc is 0 or the errno of the failed
- * read(), and its size is the number of data bytes that follow (0 when the
- * read failed).
- *
- * Returns 0 on success, EINVAL for a negative size, ENOMEM if the bounce
- * buffer cannot be allocated, or the error reported by the send routines.
- */
-static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd)
-{
-	dctl_cmd_t reply;
-	void *buf;
-	int error;
-	ssize_t rrc, size = cmd->u.dcmd_fd_io.size;
-
-	/* A negative size would turn into a huge allocation request */
-	if (size < 0)
-		return EINVAL;
-
-	buf = malloc(size);
-	if (buf == NULL)
-		return ENOMEM;
-
-	rrc = read(cmd->u.dcmd_fd_io.fd, buf, size);
-
-	/* The whole structure goes on the wire; don't leak stack contents */
-	memset(&reply, 0, sizeof(reply));
-	reply.dcmd_msg = DCTL_GEN_REPLY;
-	reply.u.dcmd_reply.rc = rrc == -1 ? errno : 0;
-	/* Never announce "-1" bytes of payload after a failed read */
-	reply.u.dcmd_reply.size = rrc > 0 ? rrc : 0;
-
-	error = dctl_send_msg(fd, &reply);
-
-	if (!error && rrc > 0)
-		error = dctl_send_data(fd, buf, rrc);
-
-	free(buf);
-
-	return error;
-}
-
-/*
- * Answer a DCTL_FD_WRITE request: receive the announced number of bytes
- * from the socket and write them to the local descriptor named in the
- * request.
- *
- * Short writes and EINTR are retried until everything has been written or
- * a real error occurs. The DCTL_GEN_REPLY that is sent back carries 0 or
- * the errno of the failed write() in rc, and the number of bytes actually
- * written in size.
- *
- * Returns 0 on success, EINVAL for a negative size, ENOMEM if the bounce
- * buffer cannot be allocated, or the error reported by the socket routines.
- */
-static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd)
-{
-	dctl_cmd_t reply;
-	char *buf;
-	int error, werr = 0;
-	ssize_t wrc, done = 0, size = cmd->u.dcmd_fd_io.size;
-
-	/* A negative size would turn into a huge allocation request */
-	if (size < 0)
-		return EINVAL;
-
-	buf = malloc(size);
-	if (buf == NULL)
-		return ENOMEM;
-
-	error = dctl_read_data(fd, buf, size);
-	if (error)
-		goto out;
-
-	while (done < size) {
-		wrc = write(cmd->u.dcmd_fd_io.fd, buf + done, size - done);
-		if (wrc == -1) {
-			if (errno == EINTR)
-				continue;
-			/* Save errno before anything else can clobber it */
-			werr = errno;
-			break;
-		}
-		/* No progress at all: report the short count, don't spin */
-		if (wrc == 0)
-			break;
-		done += wrc;
-	}
-
-	/* The whole structure goes on the wire; don't leak stack contents */
-	memset(&reply, 0, sizeof(reply));
-	reply.dcmd_msg = DCTL_GEN_REPLY;
-	reply.u.dcmd_reply.rc = werr;
-	reply.u.dcmd_reply.size = done;
-
-	error = dctl_send_msg(fd, &reply);
-
-out:
-	free(buf);
-
-	return error;
-}
-
-/*
- * Issue an ioctl over the control socket.
- *
- * A DCTL_IOCTL message carrying the request number and the address of arg
- * is sent, and then messages are read from the socket until a
- * DCTL_IOCTL_REPLY arrives. Every other message read in the meantime is a
- * request that is served from this process (copy memory in/out, read or
- * write a local file descriptor).
- *
- * Returns 0 on success. On failure returns -1. In both cases errno is set
- * to the resulting error code (0 on success).
- */
-int dctlc_ioctl(int fd, int32_t request, void *arg)
-{
- int error;
- dctl_cmd_t cmd;
-
- ASSERT(fd != 0);
-
- cmd.dcmd_msg = DCTL_IOCTL;
-
- cmd.u.dcmd_ioctl.cmd = request;
- cmd.u.dcmd_ioctl.arg = (uintptr_t) arg;
-
- error = dctl_send_msg(fd, &cmd);
-
- /* Keep serving requests until one fails or the final reply arrives */
- while (!error && (error = dctl_read_msg(fd, &cmd)) == 0) {
- switch (cmd.dcmd_msg) {
- case DCTL_IOCTL_REPLY:
- /* Final answer: the reply's rc is the ioctl result */
- error = cmd.u.dcmd_reply.rc;
- goto out;
- case DCTL_COPYIN:
- error = dctl_reply_copyin(fd, &cmd);
- break;
- case DCTL_COPYINSTR:
- error = dctl_reply_copyinstr(fd, &cmd);
- break;
- case DCTL_COPYOUT:
- error = dctl_reply_copyout(fd, &cmd);
- break;
- case DCTL_FD_READ:
- error = dctl_reply_fd_read(fd, &cmd);
- break;
- case DCTL_FD_WRITE:
- error = dctl_reply_fd_write(fd, &cmd);
- break;
- default:
- fprintf(stderr, "%s(): invalid message "
- "received.\n", __func__);
- error = EINVAL;
- goto out;
- }
- }
-
-out:
- /* errno is overwritten unconditionally, also with 0 on success */
- errno = error;
- return error ? -1 : 0;
-}
diff --git a/zfs/lib/libdmu-ctl/dctl_common.c b/zfs/lib/libdmu-ctl/dctl_common.c
deleted file mode 100644
index 8de37dcb1..000000000
--- a/zfs/lib/libdmu-ctl/dctl_common.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <stdio.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-#include <sys/dmu_ctl.h>
-#include <sys/dmu_ctl_impl.h>
-
-/*
- * Read one protocol message from fd into cmd.
- *
- * Returns 0 on success, EIO if the magic number does not match, ENOTSUP if
- * the protocol version does not match, or the dctl_read_data() error.
- */
-int dctl_read_msg(int fd, dctl_cmd_t *cmd)
-{
-	int rc;
-
-	/*
-	 * Only the header (magic number and protocol version) is read at
-	 * first, so that a peer speaking a future protocol version with a
-	 * smaller dctl_cmd_t cannot make us block forever.
-	 */
-	rc = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE);
-	if (rc != 0)
-		return rc;
-
-	if (cmd->dcmd_magic != DCTL_MAGIC) {
-		fprintf(stderr, "%s(): invalid magic number\n", __func__);
-		return EIO;
-	}
-
-	if (cmd->dcmd_version != DCTL_PROTOCOL_VER) {
-		fprintf(stderr, "%s(): invalid protocol version\n", __func__);
-		return ENOTSUP;
-	}
-
-	/* The header checks out, fetch the remainder of the command */
-	return dctl_read_data(fd, (caddr_t) cmd + DCTL_CMD_HEADER_SIZE,
-	    sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE);
-}
-
-/*
- * Send one protocol message.
- *
- * The magic number and protocol version fields of cmd are filled in here
- * (cmd is modified), then the whole dctl_cmd_t is sent. Returns the
- * dctl_send_data() result.
- */
-int dctl_send_msg(int fd, dctl_cmd_t *cmd)
-{
- cmd->dcmd_magic = DCTL_MAGIC;
- cmd->dcmd_version = DCTL_PROTOCOL_VER;
-
- return dctl_send_data(fd, cmd, sizeof(dctl_cmd_t));
-}
-
-/*
- * Receive exactly size bytes from fd into ptr.
- *
- * recv() is repeated until the buffer is full; EINTR is retried. Returns 0
- * on success, ECONNRESET if the peer closed the connection before all the
- * data arrived, or the errno of the failing recv().
- */
-int dctl_read_data(int fd, void *ptr, size_t size)
-{
-	caddr_t cursor = (caddr_t) ptr;
-	size_t remaining = size;
-
-	while (remaining > 0) {
-		ssize_t got = recv(fd, cursor, remaining, 0);
-
-		/* File descriptor closed */
-		if (got == 0)
-			return ECONNRESET;
-
-		if (got == -1) {
-			if (errno == EINTR)
-				continue;
-			return errno;
-		}
-
-		cursor += got;
-		remaining -= got;
-	}
-
-	return 0;
-}
-
-/*
- * Send exactly size bytes from ptr over fd.
- *
- * send() on a stream socket may accept fewer bytes than requested (for
- * example when it is interrupted by a signal after part of the data has
- * been queued), so it is repeated until everything has been sent. EINTR is
- * retried. MSG_NOSIGNAL keeps a closed peer from raising SIGPIPE.
- *
- * Returns 0 on success or EIO if the data could not be sent.
- */
-int dctl_send_data(int fd, const void *ptr, size_t size)
-{
-	const char *buf = ptr;
-	size_t sent = 0;
-
-	while (sent < size) {
-		ssize_t rc = send(fd, buf + sent, size - sent, MSG_NOSIGNAL);
-
-		if (rc == -1) {
-			if (errno == EINTR)
-				continue;
-			return EIO;
-		}
-
-		sent += (size_t) rc;
-	}
-
-	return 0;
-}
-
diff --git a/zfs/lib/libdmu-ctl/dctl_server.c b/zfs/lib/libdmu-ctl/dctl_server.c
deleted file mode 100644
index 016278509..000000000
--- a/zfs/lib/libdmu-ctl/dctl_server.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <stdio.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <signal.h>
-#include <limits.h>
-#include <errno.h>
-#include <poll.h>
-#include <pthread.h>
-#include <unistd.h>
-#include <sys/debug.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/un.h>
-#include <sys/list.h>
-#include <sys/cred.h>
-
-#include <sys/dmu_ctl.h>
-#include <sys/dmu_ctl_impl.h>
-
-/*
- * State of the listening control socket. dsi_fd is -1 while no socket is
- * open. Worker threads take dsi_mtx around poll()/accept() on the socket.
- */
-static dctl_sock_info_t ctl_sock = {
- .dsi_mtx = PTHREAD_MUTEX_INITIALIZER,
- .dsi_fd = -1
-};
-
-/* Defined further down; also used by dctl_thread() to recreate the socket */
-static int dctl_create_socket_common();
-
-/*
- * Routines from zfs_ioctl.c
- *
- * zfs_ioctl_init()/zfs_ioctl_fini() are called when the server is set up
- * and torn down; zfsdev_ioctl() is called for every DCTL_IOCTL message
- * received from a client.
- */
-extern int zfs_ioctl_init();
-extern int zfs_ioctl_fini();
-extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
- int *rvalp);
-
-/*
- * We can't simply put the client file descriptor in wthr_info_t because we
- * have no way of accessing it from the DMU code without extensive
- * modifications.
- *
- * Therefore each worker thread will have its own global thread-specific
- * client_fd variable. It is -1 while the thread is not serving a client.
- */
-static __thread int client_fd = -1;
-
-/*
- * Copy size bytes from the client's address src into the local buffer dest.
- *
- * A DCTL_COPYIN request is sent to the client served by this thread and the
- * data it sends back is read into dest. Returns 0 on success and EFAULT on
- * any communication failure.
- */
-int dctls_copyin(const void *src, void *dest, size_t size)
-{
-	dctl_cmd_t req;
-
-	VERIFY(client_fd >= 0);
-
-	req.dcmd_msg = DCTL_COPYIN;
-	req.u.dcmd_copy.ptr = (uintptr_t) src;
-	req.u.dcmd_copy.size = size;
-
-	if (dctl_send_msg(client_fd, &req) != 0 ||
-	    dctl_read_data(client_fd, dest, size) != 0)
-		return EFAULT;
-
-	return 0;
-}
-
-/*
- * Copy a NUL-terminated string from the client's address from into the
- * local buffer to, which can hold max bytes.
- *
- * A DCTL_COPYINSTR request is sent to the client served by this thread. The
- * client answers with a DCTL_GEN_REPLY that announces how many string bytes
- * follow; those bytes are read into to and a terminating NUL is appended.
- * If len is not NULL it receives the number of bytes stored, including the
- * NUL.
- *
- * Returns the rc sent by the client (0, or ENAMETOOLONG if the string had
- * to be cut short), ENAMETOOLONG if max is 0, or EFAULT on a communication
- * failure or a reply that does not fit the buffer.
- */
-int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len)
-{
-	dctl_cmd_t msg;
-	size_t copied;
-
-	VERIFY(client_fd >= 0);
-
-	/* max is unsigned, so the empty buffer is the only invalid size */
-	if (max == 0)
-		return ENAMETOOLONG;
-
-	msg.dcmd_msg = DCTL_COPYINSTR;
-	msg.u.dcmd_copy.ptr = (uintptr_t) from;
-	msg.u.dcmd_copy.size = max;
-
-	if (dctl_send_msg(client_fd, &msg) != 0)
-		return EFAULT;
-
-	if (dctl_read_msg(client_fd, &msg) != 0)
-		return EFAULT;
-
-	if (msg.dcmd_msg != DCTL_GEN_REPLY)
-		return EFAULT;
-
-	copied = msg.u.dcmd_reply.size;
-
-	/* Leave room for the NUL; never trust the size sent by the client */
-	if (copied >= max)
-		return EFAULT;
-
-	if (copied > 0)
-		if (dctl_read_data(client_fd, to, copied) != 0)
-			return EFAULT;
-
-	to[copied] = '\0';
-
-	if (len != NULL)
-		*len = copied + 1;
-
-	return msg.u.dcmd_reply.rc;
-}
-
-/*
- * Copy size bytes from the local buffer src to the client's address dest.
- *
- * A DCTL_COPYOUT request is sent to the client served by this thread,
- * followed by the data itself. Returns 0 on success and EFAULT on any
- * communication failure.
- */
-int dctls_copyout(const void *src, void *dest, size_t size)
-{
-	dctl_cmd_t req;
-
-	VERIFY(client_fd >= 0);
-
-	req.dcmd_msg = DCTL_COPYOUT;
-	req.u.dcmd_copy.ptr = (uintptr_t) dest;
-	req.u.dcmd_copy.size = size;
-
-	if (dctl_send_msg(client_fd, &req) != 0 ||
-	    dctl_send_data(client_fd, src, size) != 0)
-		return EFAULT;
-
-	return 0;
-}
-
-/*
- * Read up to len bytes from descriptor fd of the client into buf.
- *
- * A DCTL_FD_READ request is sent to the client served by this thread. The
- * client answers with a DCTL_GEN_REPLY carrying the result of its read()
- * and the number of data bytes that follow, which are then read into buf.
- * *residp receives the number of requested bytes that were not read.
- *
- * Returns 0 on success, EINVAL for a negative len, the rc sent by the
- * client if its read() failed, EIO for an unexpected or inconsistent
- * reply, or the error of the socket routines.
- */
-int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp)
-{
-	dctl_cmd_t msg;
-	uint64_t dsize;
-	int error;
-
-	VERIFY(client_fd >= 0);
-
-	if (len < 0)
-		return EINVAL;
-
-	msg.dcmd_msg = DCTL_FD_READ;
-	msg.u.dcmd_fd_io.fd = fd;
-	msg.u.dcmd_fd_io.size = len;
-
-	if ((error = dctl_send_msg(client_fd, &msg)) != 0)
-		return error;
-
-	if ((error = dctl_read_msg(client_fd, &msg)) != 0)
-		return error;
-
-	if (msg.dcmd_msg != DCTL_GEN_REPLY)
-		return EIO;
-
-	if (msg.u.dcmd_reply.rc != 0)
-		return msg.u.dcmd_reply.rc;
-
-	dsize = msg.u.dcmd_reply.size;
-
-	/*
-	 * The size comes from the client; reading more than len bytes
-	 * would overflow buf.
-	 */
-	if (dsize > (uint64_t) len)
-		return EIO;
-
-	if (dsize > 0)
-		error = dctl_read_data(client_fd, buf, dsize);
-
-	*residp = len - dsize;
-
-	return error;
-}
-
-/*
- * Write len bytes from src to descriptor fd of the client.
- *
- * A DCTL_FD_WRITE request is sent to the client served by this thread,
- * followed by the data; the client then answers with a DCTL_GEN_REPLY.
- *
- * Returns 0 only if the client reports that all len bytes were written.
- * Otherwise returns the error of the socket routines, the rc sent by the
- * client, or EIO for an unexpected reply or a short write.
- */
-int dctls_fd_write(int fd, const void *src, ssize_t len)
-{
- dctl_cmd_t msg;
- int error;
-
- VERIFY(client_fd >= 0);
-
- msg.dcmd_msg = DCTL_FD_WRITE;
- msg.u.dcmd_fd_io.fd = fd;
- msg.u.dcmd_fd_io.size = len;
-
- /* Request, payload, reply: each step only runs if the previous worked */
- error = dctl_send_msg(client_fd, &msg);
-
- if (!error)
- error = dctl_send_data(client_fd, src, len);
-
- if (!error)
- error = dctl_read_msg(client_fd, &msg);
-
- if (error)
- return error;
-
- if (msg.dcmd_msg != DCTL_GEN_REPLY)
- return EIO;
-
- if (msg.u.dcmd_reply.rc != 0)
- return msg.u.dcmd_reply.rc;
-
- /*
- * We have to do this because the original upstream code
- * does not check if residp == len.
- */
- if (msg.u.dcmd_reply.size != len)
- return EIO;
-
- return 0;
-}
-
-/*
- * Handle a new connection.
- *
- * DCTL_IOCTL messages are read from the client one after another; each one
- * is passed to zfsdev_ioctl() and answered with a DCTL_IOCTL_REPLY carrying
- * its return code. The loop ends on a read or send failure or on any other
- * message type, after which the socket is closed. The thread-specific
- * client_fd is set for the duration of the connection.
- */
-static void dctl_handle_conn(int sock_fd)
-{
-	dctl_cmd_t cmd;
-	dev_t dev = { 0 };
-	int rc;
-
-	client_fd = sock_fd;
-
-	for (;;) {
-		if (dctl_read_msg(sock_fd, &cmd) != 0)
-			break;
-
-		if (cmd.dcmd_msg != DCTL_IOCTL) {
-			fprintf(stderr, "%s(): unexpected message type.\n",
-			    __func__);
-			break;
-		}
-
-		rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd,
-		    (intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL);
-
-		/* The request structure is reused for the reply */
-		cmd.dcmd_msg = DCTL_IOCTL_REPLY;
-		cmd.u.dcmd_reply.rc = rc;
-
-		if (dctl_send_msg(sock_fd, &cmd) != 0)
-			break;
-	}
-
-	close(sock_fd);
-
-	client_fd = -1;
-}
-
-/*
- * Main worker thread loop.
- *
- * arg is the wthr_info_t of this thread. While holding ctl_sock.dsi_mtx the
- * thread polls the listening socket and accepts a connection; the mutex is
- * released while the connection is being served and taken again
- * afterwards. The loop ends when the thread's wthr_exit flag is set or the
- * socket cannot be recreated after a poll() error. Always returns NULL.
- */
-static void *dctl_thread(void *arg)
-{
- wthr_info_t *thr = arg;
- struct pollfd fds[1];
-
- fds[0].events = POLLIN;
-
- pthread_mutex_lock(&ctl_sock.dsi_mtx);
-
- while (!thr->wthr_exit) {
- /* Clean-up dead threads */
- dctl_thr_join();
-
- /* The file descriptor might change in the thread lifetime */
- fds[0].fd = ctl_sock.dsi_fd;
-
- /* Poll socket with 1-second timeout */
- int rc = poll(fds, 1, 1000);
- if (rc == 0 || (rc == -1 && errno == EINTR))
- continue;
-
- /* Recheck the exit flag */
- if (thr->wthr_exit)
- break;
-
- if (rc == -1) {
- /* Unknown error, let's try to recreate the socket */
- close(ctl_sock.dsi_fd);
- ctl_sock.dsi_fd = -1;
-
- if (dctl_create_socket_common() != 0)
- break;
-
- continue;
- }
- ASSERT(rc == 1);
-
- /*
- * NOTE(review): poll() may also report POLLERR/POLLHUP in revents;
- * confirm that cannot happen on the listening socket, otherwise the
- * assertion below fires.
- */
- short rev = fds[0].revents;
- if (rev == 0)
- continue;
- ASSERT(rev == POLLIN);
-
- /*
- * At this point there should be a connection ready to be
- * accepted.
- *
- * Note that this local client_fd shadows the thread-specific
- * variable of the same name; the latter is set by
- * dctl_handle_conn().
- */
- int client_fd = accept(ctl_sock.dsi_fd, NULL, NULL);
- /* Many possible errors here, we'll just retry */
- if (client_fd == -1)
- continue;
-
- /*
- * Now lets handle the request. This can take a very
- * long time (hours even), so we'll let other threads
- * handle new connections.
- */
- pthread_mutex_unlock(&ctl_sock.dsi_mtx);
-
- /* Mark this thread busy while serving, free again afterwards */
- dctl_thr_rebalance(thr, B_FALSE);
- dctl_handle_conn(client_fd);
- dctl_thr_rebalance(thr, B_TRUE);
-
- pthread_mutex_lock(&ctl_sock.dsi_mtx);
- }
- pthread_mutex_unlock(&ctl_sock.dsi_mtx);
-
- dctl_thr_die(thr);
-
- return NULL;
-}
-
-/*
- * Create, bind and start listening on the control socket at
- * ctl_sock.dsi_path, which must already be set. ctl_sock.dsi_fd must be -1
- * on entry.
- *
- * Returns 0 on success or the errno of the failing call. When bind() or
- * listen() fails the descriptor is left open in ctl_sock.dsi_fd; closing
- * it is up to the caller.
- */
-static int dctl_create_socket_common()
-{
- dctl_sock_info_t *s = &ctl_sock;
- size_t size;
- int error;
-
- ASSERT(s->dsi_fd == -1);
-
- /*
- * Unlink old socket, in case it exists.
- * We don't care about errors here.
- */
- unlink(s->dsi_path);
-
- /* Create the socket */
- s->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0);
- if (s->dsi_fd == -1) {
- /* Save errno first: perror() may change it */
- error = errno;
- perror("socket");
- return error;
- }
-
- s->dsi_addr.sun_family = AF_UNIX;
-
- /* Copy at most sizeof(sun_path) - 1 bytes and terminate explicitly */
- size = sizeof(s->dsi_addr.sun_path) - 1;
- strncpy(s->dsi_addr.sun_path, s->dsi_path, size);
-
- s->dsi_addr.sun_path[size] = '\0';
-
- if (bind(s->dsi_fd, (struct sockaddr *) &s->dsi_addr,
- sizeof(s->dsi_addr)) != 0) {
- error = errno;
- perror("bind");
- return error;
- }
-
- if (listen(s->dsi_fd, LISTEN_BACKLOG) != 0) {
- error = errno;
- perror("listen");
- /* bind() created the socket file; remove it again */
- unlink(s->dsi_path);
- return error;
- }
-
- return 0;
-}
-
-/*
- * Build the socket path "<cfg_dir>/SOCKNAME", store it in ctl_sock.dsi_path
- * and create the listening socket there.
- *
- * Returns 0 on success, ENAMETOOLONG if the path does not fit in sun_path,
- * ENOMEM if the path cannot be allocated, or the error of
- * dctl_create_socket_common(). On failure ctl_sock is left in its initial
- * state (dsi_path == NULL, dsi_fd == -1).
- */
-static int dctl_create_socket(const char *cfg_dir)
-{
-	int error;
-	dctl_sock_info_t *s = &ctl_sock;
-
-	ASSERT(s->dsi_path == NULL);
-	ASSERT(s->dsi_fd == -1);
-
-	/* Room for cfg_dir, the '/' separator, SOCKNAME and the NUL */
-	size_t pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2;
-	if (pathsize > sizeof(s->dsi_addr.sun_path))
-		return ENAMETOOLONG;
-
-	s->dsi_path = malloc(pathsize);
-	if (s->dsi_path == NULL)
-		return ENOMEM;
-
-	strcpy(s->dsi_path, cfg_dir);
-	strcat(s->dsi_path, "/" SOCKNAME);
-
-	/*
-	 * For convenience, create the directory in case it doesn't exist.
-	 * We don't care about errors here.
-	 */
-	mkdir(cfg_dir, 0770);
-
-	error = dctl_create_socket_common();
-
-	if (error) {
-		/*
-		 * Reset the pointer as well: a dangling dsi_path would trip
-		 * the assertion above on a later attempt and could be freed
-		 * a second time.
-		 */
-		free(s->dsi_path);
-		s->dsi_path = NULL;
-
-		if (s->dsi_fd != -1) {
-			close(s->dsi_fd);
-			s->dsi_fd = -1;
-		}
-	}
-
-	return error;
-}
-
-/*
- * Close the listening socket, remove its file and release the path.
- *
- * ctl_sock is returned to its initial state (dsi_fd == -1,
- * dsi_path == NULL) so that the socket can be created again later.
- */
-static void dctl_destroy_socket()
-{
-	dctl_sock_info_t *s = &ctl_sock;
-
-	ASSERT(s->dsi_path != NULL);
-	ASSERT(s->dsi_fd != -1);
-
-	close(s->dsi_fd);
-	s->dsi_fd = -1;
-
-	unlink(s->dsi_path);
-	free(s->dsi_path);
-	/* Don't leave a dangling pointer behind */
-	s->dsi_path = NULL;
-}
-
-/*
- * Initialize the DMU userspace control interface.
- * This should be called after kernel_init().
- *
- * The ioctl layer is initialized first, then the control socket is created
- * under cfg_dir and finally the worker thread pool is started. If a step
- * fails, the steps that already succeeded are undone and the error is
- * returned; 0 is returned on success.
- *
- * Note that only very rarely we have more than a couple of simultaneous
- * lzfs/lzpool connections. Since the thread pool grows automatically when all
- * threads are busy, a good value for min_thr and max_free_thr is 2.
- */
-int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr)
-{
-	int error;
-
-	ASSERT(min_thr > 0);
-	ASSERT(max_free_thr >= min_thr);
-
-	error = zfs_ioctl_init();
-	if (error)
-		return error;
-
-	error = dctl_create_socket(cfg_dir);
-	if (error == 0) {
-		error = dctl_thr_pool_create(min_thr, max_free_thr,
-		    dctl_thread);
-		if (error == 0)
-			return 0;
-
-		/* No worker threads: take everything down again */
-		(void) zfs_ioctl_fini();
-		dctl_destroy_socket();
-		return error;
-	}
-
-	/* No socket: only the ioctl layer has to be undone */
-	(void) zfs_ioctl_fini();
-	return error;
-}
-
-/*
- * Terminate control interface.
- * This should be called after closing all objsets, but before calling
- * kernel_fini().
- * May return EBUSY if the SPA is busy.
- *
- * Thread pool destruction can take a while due to poll()
- * timeout or due to a thread being busy (e.g. a backup is being taken).
- *
- * The worker threads are stopped first, then the control socket is
- * destroyed. The return value is that of zfs_ioctl_fini().
- */
-int dctl_server_fini()
-{
- dctl_thr_pool_stop();
- dctl_destroy_socket();
-
- return zfs_ioctl_fini();
-}
diff --git a/zfs/lib/libdmu-ctl/dctl_thrpool.c b/zfs/lib/libdmu-ctl/dctl_thrpool.c
deleted file mode 100644
index 7b2f9b4c2..000000000
--- a/zfs/lib/libdmu-ctl/dctl_thrpool.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <stdlib.h>
-#include <stddef.h>
-#include <time.h>
-#include <pthread.h>
-#include <errno.h>
-#include <sys/list.h>
-#include <sys/debug.h>
-
-#include <sys/dmu_ctl.h>
-#include <sys/dmu_ctl_impl.h>
-
-/*
- * State of the worker thread pool. The remaining fields are set up by
- * dctl_thr_pool_create(); dti_mtx is taken around accesses to the thread
- * lists and the free-thread counter.
- */
-static dctl_thr_info_t thr_pool = {
- .dti_mtx = PTHREAD_MUTEX_INITIALIZER
-};
-
-/*
- * Create n threads.
- * Callers must acquire thr_pool.dti_mtx first.
- *
- * Every new thread starts out marked as free, is counted in dti_free and is
- * appended to the pool's thread list. Returns 0 on success, ENOMEM if a
- * thread descriptor cannot be allocated, or the pthread_create() error.
- * Threads created before a failure are left running.
- */
-static int dctl_thr_create(int n)
-{
-	dctl_thr_info_t *pool = &thr_pool;
-	int remaining = n;
-
-	while (remaining-- > 0) {
-		wthr_info_t *thr = malloc(sizeof(wthr_info_t));
-		int rc;
-
-		if (thr == NULL)
-			return ENOMEM;
-
-		thr->wthr_exit = B_FALSE;
-		thr->wthr_free = B_TRUE;
-
-		rc = pthread_create(&thr->wthr_id, NULL, pool->dti_thr_func,
-		    thr);
-		if (rc != 0) {
-			free(thr);
-			return rc;
-		}
-
-		pool->dti_free++;
-
-		list_insert_tail(&pool->dti_list, thr);
-	}
-
-	return 0;
-}
-
-/*
- * Mark the thread as dead.
- * Must be called right before exiting the main thread function.
- *
- * The thread's exit flag is set, it is accounted as not free, and its
- * descriptor is moved from the pool's thread list to the join list, from
- * where dctl_thr_join() reaps it.
- */
-void dctl_thr_die(wthr_info_t *thr)
-{
- dctl_thr_info_t *p = &thr_pool;
-
- thr->wthr_exit = B_TRUE;
- /* Takes and releases dti_mtx itself, so call it before locking below */
- dctl_thr_rebalance(thr, B_FALSE);
-
- pthread_mutex_lock(&p->dti_mtx);
-
- list_remove(&p->dti_list, thr);
- list_insert_tail(&p->dti_join_list, thr);
-
- pthread_mutex_unlock(&p->dti_mtx);
-}
-
-/*
- * Clean-up dead threads.
- *
- * Every thread on the join list is removed from it, joined and its
- * descriptor freed. The pool mutex is held for the whole operation.
- */
-void dctl_thr_join()
-{
- dctl_thr_info_t *p = &thr_pool;
- wthr_info_t *thr;
-
- pthread_mutex_lock(&p->dti_mtx);
-
- while ((thr = list_head(&p->dti_join_list))) {
- list_remove(&p->dti_join_list, thr);
-
- /* A thread must never end up joining itself */
- ASSERT(!pthread_equal(thr->wthr_id, pthread_self()));
-
- /*
- * This should not block because all the threads
- * on this list should have died already.
- *
- * pthread_join() can only return an error if
- * we made a programming mistake.
- */
- VERIFY(pthread_join(thr->wthr_id, NULL) == 0);
-
- ASSERT(thr->wthr_exit);
- ASSERT(!thr->wthr_free);
-
- free(thr);
- }
-
- pthread_mutex_unlock(&p->dti_mtx);
-}
-
-/*
- * Set the free status of a thread and keep the pool's count of free
- * threads in step with it.
- *
- * Callers must acquire thr_pool.dti_mtx first.
- */
-static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free)
-{
-	dctl_thr_info_t *pool = &thr_pool;
-	boolean_t was_free = thr->wthr_free;
-
-	ASSERT(pool->dti_free >= 0);
-
-	/* The counter only moves when the status actually changes */
-	if (set_free) {
-		if (!was_free)
-			pool->dti_free++;
-	} else if (was_free) {
-		pool->dti_free--;
-	}
-
-	ASSERT(pool->dti_free >= 0);
-
-	thr->wthr_free = set_free;
-}
-
-/*
- * Rebalance threads. Also adjusts the free status of the thread.
- * Will set the thread exit flag if the number of free threads is above
- * the limit.
- *
- * The exit flag is also set when the pool is shutting down. A thread whose
- * exit flag is set is never marked free. If no free thread is left
- * afterwards and the pool is not shutting down, one more thread is created.
- * Takes thr_pool.dti_mtx itself.
- */
-void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free)
-{
- dctl_thr_info_t *p = &thr_pool;
-
- pthread_mutex_lock(&p->dti_mtx);
-
- if (p->dti_exit || p->dti_free > p->dti_max_free)
- thr->wthr_exit = B_TRUE;
-
- if (thr->wthr_exit)
- set_free = B_FALSE;
-
- dctl_thr_adjust_free(thr, set_free);
-
- /* The result is ignored: failing to grow the pool is not reported */
- if (!p->dti_exit && p->dti_free == 0)
- dctl_thr_create(1);
-
- pthread_mutex_unlock(&p->dti_mtx);
-}
-
-/*
- * Stop the thread pool.
- *
- * This can take a while since it actually waits for all threads to exit.
- *
- * All threads are flagged to exit and accounted as not free; then the
- * thread list is checked every 50ms, reaping dead threads, until it is
- * empty. Finally both lists are destroyed.
- */
-void dctl_thr_pool_stop()
-{
- dctl_thr_info_t *p = &thr_pool;
- wthr_info_t *thr;
- struct timespec ts;
-
- pthread_mutex_lock(&p->dti_mtx);
-
- ASSERT(!p->dti_exit);
- p->dti_exit = B_TRUE;
-
- /* Let's flag the threads first */
- thr = list_head(&p->dti_list);
- while (thr != NULL) {
- thr->wthr_exit = B_TRUE;
- dctl_thr_adjust_free(thr, B_FALSE);
-
- thr = list_next(&p->dti_list, thr);
- }
-
- pthread_mutex_unlock(&p->dti_mtx);
-
- /* Now let's wait for them to exit */
- ts.tv_sec = 0;
- ts.tv_nsec = 50000000; /* 50ms */
- do {
- nanosleep(&ts, NULL);
-
- /* thr is only tested against NULL: is any thread still alive? */
- pthread_mutex_lock(&p->dti_mtx);
- thr = list_head(&p->dti_list);
- pthread_mutex_unlock(&p->dti_mtx);
-
- dctl_thr_join();
- } while(thr != NULL);
-
- ASSERT(p->dti_free == 0);
-
- ASSERT(list_is_empty(&p->dti_list));
- ASSERT(list_is_empty(&p->dti_join_list));
-
- list_destroy(&p->dti_list);
- list_destroy(&p->dti_join_list);
-}
-
-/*
- * Create thread pool.
- *
- * If at least one thread creation fails, it will stop all previous
- * threads and return a non-zero value.
- *
- * min_thr threads running thr_func are started. max_free_thr is stored as
- * the limit of free threads that dctl_thr_rebalance() checks against.
- * Returns 0 on success.
- */
-int dctl_thr_pool_create(int min_thr, int max_free_thr,
- thr_func_t *thr_func)
-{
- int error;
- dctl_thr_info_t *p = &thr_pool;
-
- ASSERT(p->dti_free == 0);
-
- /* Initialize global variables */
- p->dti_min = min_thr;
- p->dti_max_free = max_free_thr;
- p->dti_exit = B_FALSE;
- p->dti_thr_func = thr_func;
-
- /* Both lists link wthr_info_t structures through wthr_node */
- list_create(&p->dti_list, sizeof(wthr_info_t), offsetof(wthr_info_t,
- wthr_node));
- list_create(&p->dti_join_list, sizeof(wthr_info_t),
- offsetof(wthr_info_t, wthr_node));
-
- pthread_mutex_lock(&p->dti_mtx);
- error = dctl_thr_create(min_thr);
- pthread_mutex_unlock(&p->dti_mtx);
-
- /* Threads created before the failure are stopped and reaped here */
- if (error)
- dctl_thr_pool_stop();
-
- return error;
-}
diff --git a/zfs/lib/libdmu-ctl/dmu_send.c b/zfs/lib/libdmu-ctl/dmu_send.c
deleted file mode 100644
index 1c72f9507..000000000
--- a/zfs/lib/libdmu-ctl/dmu_send.c
+++ /dev/null
@@ -1,1249 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)dmu_send.c 1.14 08/04/27 SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-
-static char *dmu_recv_tag = "dmu_recv_tag";
-
-struct backuparg {
- dmu_replay_record_t *drr;
- vnode_t *vp;
- offset_t *off;
- objset_t *os;
- zio_cksum_t zc;
- int err;
-};
-
-static int
-dump_bytes(struct backuparg *ba, void *buf, int len)
-{
- ssize_t resid; /* have to get resid to get detailed errno */
- ASSERT3U(len % 8, ==, 0);
-
- fletcher_4_incremental_native(buf, len, &ba->zc);
- ba->err = vn_rdwr(UIO_WRITE, ba->vp,
- (caddr_t)buf, len,
- 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
- *ba->off += len;
- return (ba->err);
-}
-
-static int
-dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
- uint64_t length)
-{
- /* write a FREE record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_FREE;
- ba->drr->drr_u.drr_free.drr_object = object;
- ba->drr->drr_u.drr_free.drr_offset = offset;
- ba->drr->drr_u.drr_free.drr_length = length;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_data(struct backuparg *ba, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, void *data)
-{
- /* write a DATA record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_WRITE;
- ba->drr->drr_u.drr_write.drr_object = object;
- ba->drr->drr_u.drr_write.drr_type = type;
- ba->drr->drr_u.drr_write.drr_offset = offset;
- ba->drr->drr_u.drr_write.drr_length = blksz;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- if (dump_bytes(ba, data, blksz))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
-{
- /* write a FREEOBJECTS record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_FREEOBJECTS;
- ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
- ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
-{
- if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
- return (dump_freeobjects(ba, object, 1));
-
- /* write an OBJECT record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_OBJECT;
- ba->drr->drr_u.drr_object.drr_object = object;
- ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
- ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
- ba->drr->drr_u.drr_object.drr_blksz =
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
- ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
- ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
-
- if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
- return (EINTR);
-
- /* free anything past the end of the file */
- if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
- return (EINTR);
- if (ba->err)
- return (EINTR);
- return (0);
-}
-
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
-
-static int
-backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- struct backuparg *ba = arg;
- uint64_t object = bc->bc_bookmark.zb_object;
- int level = bc->bc_bookmark.zb_level;
- uint64_t blkid = bc->bc_bookmark.zb_blkid;
- blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
- dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
- void *data = bc->bc_data;
- int err = 0;
-
- if (issig(JUSTLOOKING) && issig(FORREAL))
- return (EINTR);
-
- ASSERT(data || bp == NULL);
-
- if (bp == NULL && object == 0) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
- } else if (bp == NULL) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- err = dump_free(ba, object, blkid * span, span);
- } else if (data && level == 0 && type == DMU_OT_DNODE) {
- dnode_phys_t *blk = data;
- int i;
- int blksz = BP_GET_LSIZE(bp);
-
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj =
- (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = dump_dnode(ba, dnobj, blk+i);
- if (err)
- break;
- }
- } else if (level == 0 &&
- type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
- int blksz = BP_GET_LSIZE(bp);
- if (data == NULL) {
- uint32_t aflags = ARC_WAIT;
- arc_buf_t *abuf;
- zbookmark_t zb;
-
- zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
- zb.zb_object = object;
- zb.zb_level = level;
- zb.zb_blkid = blkid;
- (void) arc_read(NULL, spa, bp,
- dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
-
- if (abuf) {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, abuf->b_data);
- (void) arc_buf_remove_ref(abuf, &abuf);
- }
- } else {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, data);
- }
- }
-
- ASSERT(err == 0 || err == EINTR);
- return (err);
-}
-
-int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
- vnode_t *vp, offset_t *off)
-{
- dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
- dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
- dmu_replay_record_t *drr;
- struct backuparg ba;
- int err;
- uint64_t fromtxg = 0;
-
- /* tosnap must be a snapshot */
- if (ds->ds_phys->ds_next_snap_obj == 0)
- return (EINVAL);
-
- /* fromsnap must be an earlier snapshot from the same fs as tosnap */
- if (fromds && (ds->ds_dir != fromds->ds_dir ||
- fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
- return (EXDEV);
-
- if (fromorigin) {
- if (fromsnap)
- return (EINVAL);
-
- if (ds->ds_dir->dd_phys->dd_origin_obj != NULL) {
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dsl_dataset_open_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj, NULL,
- DS_MODE_NONE, FTAG, &fromds);
- rw_exit(&dp->dp_config_rwlock);
- if (err)
- return (err);
- } else {
- fromorigin = B_FALSE;
- }
- }
-
-
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
- drr->drr_u.drr_begin.drr_creation_time =
- ds->ds_phys->ds_creation_time;
- drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
- if (fromorigin)
- drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
- drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
- if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
- drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
-
- if (fromds)
- drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
- dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-
- if (fromds)
- fromtxg = fromds->ds_phys->ds_creation_txg;
- if (fromorigin)
- dsl_dataset_close(fromds, DS_MODE_NONE, FTAG);
-
- ba.drr = drr;
- ba.vp = vp;
- ba.os = tosnap;
- ba.off = off;
- ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
-
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (ba.err);
- }
-
- err = traverse_dsl_dataset(ds, fromtxg,
- ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
- backup_cb, &ba);
-
- if (err) {
- if (err == EINTR && ba.err)
- err = ba.err;
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (err);
- }
-
- bzero(drr, sizeof (dmu_replay_record_t));
- drr->drr_type = DRR_END;
- drr->drr_u.drr_end.drr_checksum = ba.zc;
-
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (ba.err);
- }
-
- kmem_free(drr, sizeof (dmu_replay_record_t));
-
- return (0);
-}
-
-struct recvbeginsyncarg {
- const char *tofs;
- const char *tosnap;
- dsl_dataset_t *origin;
- uint64_t fromguid;
- dmu_objset_type_t type;
- void *tag;
- boolean_t force;
- uint64_t dsflags;
- char clonelastname[MAXNAMELEN];
- dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
-};
-
-static dsl_dataset_t *
-recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
- cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
-
- VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL,
- DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds));
-
- if (type != DMU_OST_NONE) {
- (void) dmu_objset_create_impl(dp->dp_spa,
- ds, &ds->ds_phys->ds_bp, type, tx);
- }
-
- spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
- ds->ds_phys->ds_dir_obj);
-
- return (ds);
-}
-
-/* ARGSUSED */
-static int
-recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t val;
- int err;
-
- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
- strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
-
- if (err != ENOENT)
- return (err ? err : EEXIST);
-
- if (rbsa->origin) {
- /* make sure it's a snap in the same pool */
- if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
- return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
- return (EINVAL);
- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
- }
-
- return (0);
-}
-
-static void
-recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- uint64_t dsobj;
- uint64_t flags = DS_FLAG_INCONSISTENT;
-
- flags |= rbsa->dsflags;
-
- dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
- rbsa->origin, flags, cr, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
-
-static int
-recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- int err;
-
- /* must be a head ds */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /* must not be a clone ds */
- if (ds->ds_prev != NULL)
- return (EINVAL);
-
- err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
- if (err)
- return (err);
-
- if (rbsa->origin) {
- /* make sure it's a snap in the same pool */
- if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
- return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
- return (EINVAL);
- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
- }
-
- return (0);
-}
-
-static void
-recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- uint64_t dsobj;
- uint64_t flags = DS_FLAG_INCONSISTENT;
-
- flags |= rbsa->dsflags;
-
- /*
- * NB: caller must provide an extra hold on the dsl_dir_t, so it
- * won't go away when dsl_dataset_destroy_sync() closes the
- * dataset.
- */
- dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
-
- dsobj = dsl_dataset_create_sync_impl(dd, rbsa->origin, flags, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
-
-/* ARGSUSED */
-static int
-recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- int err;
- uint64_t val;
-
- /* must not have any changes since most recent snapshot */
- if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
- return (ETXTBSY);
-
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
-
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
-
- /* temporary clone name must not exist */
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_dir->dd_phys->dd_child_dir_zapobj,
- rbsa->clonelastname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- /* new snapshot name must not exist */
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ohds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- dsl_pool_t *dp = ohds->ds_dir->dd_pool;
- dsl_dataset_t *ods, *cds;
- uint64_t dsobj;
- uint64_t flags = DS_FLAG_INCONSISTENT;
-
- flags |= rbsa->dsflags;
-
- /* create the temporary clone */
- VERIFY(0 == dsl_dataset_open_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_STANDARD, FTAG, &ods));
- dsobj = dsl_dataset_create_sync(ohds->ds_dir,
- rbsa->clonelastname, ods, flags, cr, tx);
- dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
-
- /* open the temporary clone */
- VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL,
- DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds));
-
- /* copy the refquota from the target fs to the clone */
- if (ohds->ds_quota > 0)
- dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
-
- rbsa->ds = cds;
-
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld",
- cds->ds_phys->ds_dir_obj);
-}
-
-/* ARGSUSED */
-static void
-recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
- ds->ds_phys->ds_dir_obj);
-}
-
-/*
- * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
- * succeeds; otherwise we will leak the holds on the datasets.
- */
-int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
-{
- int err = 0;
- boolean_t byteswap;
- struct recvbeginsyncarg rbsa;
- uint64_t version;
- int flags;
- dsl_dataset_t *ds;
-
- if (drrb->drr_magic == DMU_BACKUP_MAGIC)
- byteswap = FALSE;
- else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
- byteswap = TRUE;
- else
- return (EINVAL);
-
- rbsa.tofs = tofs;
- rbsa.tosnap = tosnap;
- rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
- rbsa.fromguid = drrb->drr_fromguid;
- rbsa.type = drrb->drr_type;
- rbsa.tag = FTAG;
- rbsa.dsflags = 0;
- version = drrb->drr_version;
- flags = drrb->drr_flags;
-
- if (byteswap) {
- rbsa.type = BSWAP_32(rbsa.type);
- rbsa.fromguid = BSWAP_64(rbsa.fromguid);
- version = BSWAP_64(version);
- flags = BSWAP_32(flags);
- }
-
- if (version != DMU_BACKUP_STREAM_VERSION ||
- rbsa.type >= DMU_OST_NUMTYPES ||
- ((flags & DRR_FLAG_CLONE) && origin == NULL))
- return (EINVAL);
-
- if (flags & DRR_FLAG_CI_DATA)
- rbsa.dsflags = DS_FLAG_CI_DATASET;
-
- bzero(drc, sizeof (dmu_recv_cookie_t));
- drc->drc_drrb = drrb;
- drc->drc_tosnap = tosnap;
- drc->drc_force = force;
-
- /*
- * Process the begin in syncing context.
- */
- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
- /* offline incremental receive */
- err = dsl_dataset_open(tofs,
- DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds);
- if (err)
- return (err);
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- rbsa.fromguid) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
- dmu_recv_tag);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
- }
- rbsa.force = B_FALSE;
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_offline_incremental_sync,
- ds, &rbsa, 1);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, dmu_recv_tag);
- return (err);
- }
- drc->drc_logical_ds = drc->drc_real_ds = ds;
- } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
- /* online incremental receive */
-
- /* tmp clone name is: tofs/%tosnap" */
- (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
- "%%%s", tosnap);
-
- /* open the dataset we are logically receiving into */
- err = dsl_dataset_open(tofs,
- DS_MODE_STANDARD, dmu_recv_tag, &ds);
- if (err)
- return (err);
-
- rbsa.force = force;
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_online_incremental_sync, ds, &rbsa, 5);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_STANDARD, dmu_recv_tag);
- return (err);
- }
- drc->drc_logical_ds = ds;
- drc->drc_real_ds = rbsa.ds;
- } else {
- /* create new fs -- full backup or clone */
- dsl_dir_t *dd = NULL;
- const char *tail;
-
- err = dsl_dir_open(tofs, FTAG, &dd, &tail);
- if (err)
- return (err);
- if (tail == NULL) {
- if (!force) {
- dsl_dir_close(dd, FTAG);
- return (EEXIST);
- }
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- err = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- if (err) {
- dsl_dir_close(dd, FTAG);
- return (err);
- }
-
- err = dsl_sync_task_do(dd->dd_pool,
- recv_full_existing_check,
- recv_full_existing_sync, ds, &rbsa, 5);
- /* if successful, sync task closes the ds for us */
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
- recv_full_sync, dd, &rbsa, 5);
- if (err)
- return (err);
- }
- dsl_dir_close(dd, FTAG);
- if (err)
- return (err);
- drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
- drc->drc_newfs = B_TRUE;
- }
-
- /* downgrade our hold on the ds from EXCLUSIVE to PRIMARY */
- dsl_dataset_downgrade(drc->drc_real_ds,
- DS_MODE_EXCLUSIVE, DS_MODE_PRIMARY);
-
- return (0);
-}
-
-struct restorearg {
- int err;
- int byteswap;
- vnode_t *vp;
- char *buf;
- uint64_t voff;
- int bufsize; /* amount of memory allocated for buf */
- zio_cksum_t cksum;
-};
-
-static void *
-restore_read(struct restorearg *ra, int len)
-{
- void *rv;
- int done = 0;
-
- /* some things will require 8-byte alignment, so everything must */
- ASSERT3U(len % 8, ==, 0);
-
- while (done < len) {
- ssize_t resid;
-
- ra->err = vn_rdwr(UIO_READ, ra->vp,
- (caddr_t)ra->buf + done, len - done,
- ra->voff, UIO_SYSSPACE, FAPPEND,
- RLIM64_INFINITY, CRED(), &resid);
-
- if (resid == len - done)
- ra->err = EINVAL;
- ra->voff += len - done - resid;
- done = len - resid;
- if (ra->err)
- return (NULL);
- }
-
- ASSERT3U(done, ==, len);
- rv = ra->buf;
- if (ra->byteswap)
- fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
- else
- fletcher_4_incremental_native(rv, len, &ra->cksum);
- return (rv);
-}
-
-static void
-backup_byteswap(dmu_replay_record_t *drr)
-{
-#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
-#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
- drr->drr_type = BSWAP_32(drr->drr_type);
- drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
- switch (drr->drr_type) {
- case DRR_BEGIN:
- DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_version);
- DO64(drr_begin.drr_creation_time);
- DO32(drr_begin.drr_type);
- DO32(drr_begin.drr_flags);
- DO64(drr_begin.drr_toguid);
- DO64(drr_begin.drr_fromguid);
- break;
- case DRR_OBJECT:
- DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
- DO32(drr_object.drr_type);
- DO32(drr_object.drr_bonustype);
- DO32(drr_object.drr_blksz);
- DO32(drr_object.drr_bonuslen);
- break;
- case DRR_FREEOBJECTS:
- DO64(drr_freeobjects.drr_firstobj);
- DO64(drr_freeobjects.drr_numobjs);
- break;
- case DRR_WRITE:
- DO64(drr_write.drr_object);
- DO32(drr_write.drr_type);
- DO64(drr_write.drr_offset);
- DO64(drr_write.drr_length);
- break;
- case DRR_FREE:
- DO64(drr_free.drr_object);
- DO64(drr_free.drr_offset);
- DO64(drr_free.drr_length);
- break;
- case DRR_END:
- DO64(drr_end.drr_checksum.zc_word[0]);
- DO64(drr_end.drr_checksum.zc_word[1]);
- DO64(drr_end.drr_checksum.zc_word[2]);
- DO64(drr_end.drr_checksum.zc_word[3]);
- break;
- }
-#undef DO64
-#undef DO32
-}
-
-static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
-{
- int err;
- dmu_tx_t *tx;
-
- err = dmu_object_info(os, drro->drr_object, NULL);
-
- if (err != 0 && err != ENOENT)
- return (EINVAL);
-
- if (drro->drr_type == DMU_OT_NONE ||
- drro->drr_type >= DMU_OT_NUMTYPES ||
- drro->drr_bonustype >= DMU_OT_NUMTYPES ||
- drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
- drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
- P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
- drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
- return (EINVAL);
- }
-
- tx = dmu_tx_create(os);
-
- if (err == ENOENT) {
- /* currently free, want to be allocated */
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_object_claim(os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
- } else {
- /* currently allocated, want to be allocated */
- dmu_tx_hold_bonus(tx, drro->drr_object);
- /*
- * We may change blocksize, so need to
- * hold_write
- */
- dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- err = dmu_object_reclaim(os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
- }
- if (err) {
- dmu_tx_commit(tx);
- return (EINVAL);
- }
-
- dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
- dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
-
- if (drro->drr_bonuslen) {
- dmu_buf_t *db;
- void *data;
- VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
- if (data == NULL) {
- dmu_tx_commit(tx);
- return (ra->err);
- }
- bcopy(data, db->db_data, drro->drr_bonuslen);
- if (ra->byteswap) {
- dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
- drro->drr_bonuslen);
- }
- dmu_buf_rele(db, FTAG);
- }
- dmu_tx_commit(tx);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
- struct drr_freeobjects *drrfo)
-{
- uint64_t obj;
-
- if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
- return (EINVAL);
-
- for (obj = drrfo->drr_firstobj;
- obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
- int err;
-
- if (dmu_object_info(os, obj, NULL) != 0)
- continue;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_object_free(os, obj, tx);
- dmu_tx_commit(tx);
- if (err && err != ENOENT)
- return (EINVAL);
- }
- return (0);
-}
-
-static int
-restore_write(struct restorearg *ra, objset_t *os,
- struct drr_write *drrw)
-{
- dmu_tx_t *tx;
- void *data;
- int err;
-
- if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
- drrw->drr_type >= DMU_OT_NUMTYPES)
- return (EINVAL);
-
- data = restore_read(ra, drrw->drr_length);
- if (data == NULL)
- return (ra->err);
-
- if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
- return (EINVAL);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- if (ra->byteswap)
- dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
- dmu_write(os, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length, data, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-restore_free(struct restorearg *ra, objset_t *os,
- struct drr_free *drrf)
-{
- dmu_tx_t *tx;
- int err;
-
- if (drrf->drr_length != -1ULL &&
- drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
- return (EINVAL);
-
- if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
- return (EINVAL);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-void
-dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
-{
- if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
- /*
- * online incremental or new fs: destroy the fs (which
- * may be a clone) that we created
- */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
- if (drc->drc_real_ds != drc->drc_logical_ds) {
- dsl_dataset_close(drc->drc_logical_ds,
- DS_MODE_STANDARD, dmu_recv_tag);
- }
- } else {
- /*
- * offline incremental: rollback to most recent snapshot.
- */
- int lmode = DS_MODE_PRIMARY;
- if (dsl_dataset_tryupgrade(drc->drc_real_ds,
- DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) {
- lmode = DS_MODE_EXCLUSIVE;
- (void) dsl_dataset_rollback(drc->drc_real_ds,
- DMU_OST_NONE);
- }
- dsl_dataset_close(drc->drc_real_ds, lmode, FTAG);
- }
-}
-
-/*
- * NB: callers *must* call dmu_recv_end() if this succeeds.
- */
-int
-dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
-{
- struct restorearg ra = { 0 };
- dmu_replay_record_t *drr;
- objset_t *os;
- zio_cksum_t pcksum;
-
- if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
- ra.byteswap = TRUE;
-
- {
- /* compute checksum of drr_begin record */
- dmu_replay_record_t *drr;
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin = *drc->drc_drrb;
- if (ra.byteswap) {
- fletcher_4_incremental_byteswap(drr,
- sizeof (dmu_replay_record_t), &ra.cksum);
- } else {
- fletcher_4_incremental_native(drr,
- sizeof (dmu_replay_record_t), &ra.cksum);
- }
- kmem_free(drr, sizeof (dmu_replay_record_t));
- }
-
- if (ra.byteswap) {
- struct drr_begin *drrb = drc->drc_drrb;
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
- }
-
- ra.vp = vp;
- ra.voff = *voffp;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
-
- /* these were verified in dmu_recv_begin */
- ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
- ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
-
- /*
- * Open the objset we are modifying.
- */
- VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
-
- ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
-
- /*
- * Read records and process them.
- */
- pcksum = ra.cksum;
- while (ra.err == 0 &&
- NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
- ra.err = EINTR;
- goto out;
- }
-
- if (ra.byteswap)
- backup_byteswap(drr);
-
- switch (drr->drr_type) {
- case DRR_OBJECT:
- {
- /*
- * We need to make a copy of the record header,
- * because restore_{object,write} may need to
- * restore_read(), which will invalidate drr.
- */
- struct drr_object drro = drr->drr_u.drr_object;
- ra.err = restore_object(&ra, os, &drro);
- break;
- }
- case DRR_FREEOBJECTS:
- {
- struct drr_freeobjects drrfo =
- drr->drr_u.drr_freeobjects;
- ra.err = restore_freeobjects(&ra, os, &drrfo);
- break;
- }
- case DRR_WRITE:
- {
- struct drr_write drrw = drr->drr_u.drr_write;
- ra.err = restore_write(&ra, os, &drrw);
- break;
- }
- case DRR_FREE:
- {
- struct drr_free drrf = drr->drr_u.drr_free;
- ra.err = restore_free(&ra, os, &drrf);
- break;
- }
- case DRR_END:
- {
- struct drr_end drre = drr->drr_u.drr_end;
- /*
- * We compare against the *previous* checksum
- * value, because the stored checksum is of
- * everything before the DRR_END record.
- */
- if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
- ra.err = ECKSUM;
- goto out;
- }
- default:
- ra.err = EINVAL;
- goto out;
- }
- pcksum = ra.cksum;
- }
- ASSERT(ra.err != 0);
-
-out:
- dmu_objset_close(os);
-
- if (ra.err != 0) {
- /*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
- */
- txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
- dmu_recv_abort_cleanup(drc);
- }
-
- kmem_free(ra.buf, ra.bufsize);
- *voffp = ra.voff;
- return (ra.err);
-}
-
-struct recvendsyncarg {
- char *tosnap;
- uint64_t creation_time;
- uint64_t toguid;
-};
-
-static int
-recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvendsyncarg *resa = arg2;
-
- return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
-}
-
-static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvendsyncarg *resa = arg2;
-
- dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
-
- /* set snapshot's creation time and guid */
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
- ds->ds_prev->ds_phys->ds_guid = resa->toguid;
- ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-}
-
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
-{
- int err = 0;
- int lmode;
-
- /*
- * XXX hack; seems the ds is still dirty and
- * dsl_pool_zil_clean() expects it to have a ds_user_ptr (and
- * zil), but clone_swap() can close it.
- */
- txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
-
- if (dsl_dataset_tryupgrade(drc->drc_real_ds,
- DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) {
- lmode = DS_MODE_EXCLUSIVE;
- } else {
- dmu_recv_abort_cleanup(drc);
- return (EBUSY);
- }
-
- if (drc->drc_logical_ds != drc->drc_real_ds) {
- if (err == 0 && dsl_dataset_tryupgrade(drc->drc_logical_ds,
- DS_MODE_STANDARD, DS_MODE_EXCLUSIVE)) {
- lmode = DS_MODE_EXCLUSIVE;
- err = dsl_dataset_clone_swap(drc->drc_real_ds,
- drc->drc_logical_ds, drc->drc_force);
- } else {
- lmode = DS_MODE_STANDARD;
- err = EBUSY;
- }
- }
-
- if (err == 0) {
- struct recvendsyncarg resa;
-
- resa.creation_time = drc->drc_drrb->drr_creation_time;
- resa.toguid = drc->drc_drrb->drr_toguid;
- resa.tosnap = drc->drc_tosnap;
-
- err = dsl_sync_task_do(drc->drc_real_ds->ds_dir->dd_pool,
- recv_end_check, recv_end_sync,
- drc->drc_logical_ds, &resa, 3);
- if (err) {
- if (drc->drc_newfs) {
- ASSERT(drc->drc_logical_ds == drc->drc_real_ds);
- (void) dsl_dataset_destroy(drc->drc_real_ds,
- dmu_recv_tag);
- return (err);
- } else {
- (void) dsl_dataset_rollback(drc->drc_logical_ds,
- DMU_OST_NONE);
- }
- }
- }
-
- if (drc->drc_logical_ds != drc->drc_real_ds) {
- /* dsl_dataset_destroy() will close the ds */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
- }
- /* close the hold from dmu_recv_begin */
- dsl_dataset_close(drc->drc_logical_ds, lmode, dmu_recv_tag);
- return (err);
-}
diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h
deleted file mode 100644
index c2044ba27..000000000
--- a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_CTL_H
-#define _SYS_DMU_CTL_H
-
-#include <sys/types.h>
-
-/* Default directory where the clients search for sockets to connect */
-#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu"
-
-/*
- * These functions are called by the server process.
- *
- * kernel_init() must be called before dctl_server_init().
- * kernel_fini() must not be called before dctl_server_fini().
- *
- * All objsets must be closed and object references be released before calling
- * dctl_server_fini(), otherwise it will return EBUSY.
- *
- * Note: On Solaris, it is highly recommended to either catch or ignore the
- * SIGPIPE signal, otherwise the server process will die if the client is
- * killed.
- */
-int dctl_server_init(const char *cfg_dir, int min_threads,
- int max_free_threads);
-int dctl_server_fini();
-
-/*
- * The following functions are called by the DMU from the server process context
- * (in the worker threads).
- */
-int dctls_copyin(const void *src, void *dest, size_t size);
-int dctls_copyinstr(const char *from, char *to, size_t max,
- size_t *len);
-int dctls_copyout(const void *src, void *dest, size_t size);
-int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp);
-int dctls_fd_write(int fd, const void *src, ssize_t len);
-
-/*
- * These functions are called by the client process (libzfs).
- */
-int dctlc_connect(const char *dir, boolean_t check_subdirs);
-void dctlc_disconnect(int fd);
-
-int dctlc_ioctl(int fd, int32_t request, void *arg);
-
-#endif
diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h
deleted file mode 100644
index 6b4a564b3..000000000
--- a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_CTL_IMPL_H
-#define _SYS_DMU_CTL_IMPL_H
-
-#include <sys/list.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <pthread.h>
-
-#define SOCKNAME "dmu_socket"
-
-#define DCTL_PROTOCOL_VER 1
-#define DCTL_MAGIC 0xdc71b1070c01dc71ll
-
-/* Message types */
-enum {
- DCTL_IOCTL,
- DCTL_IOCTL_REPLY,
- DCTL_COPYIN,
- DCTL_COPYINSTR,
- DCTL_COPYOUT,
- DCTL_FD_READ,
- DCTL_FD_WRITE,
- DCTL_GEN_REPLY /* generic reply */
-};
-
-/* On-the-wire message */
-typedef struct dctl_cmd {
- uint64_t dcmd_magic;
- int8_t dcmd_version;
- int8_t dcmd_msg;
- uint8_t dcmd_pad[6];
- union {
- struct dcmd_ioctl {
- uint64_t arg;
- int32_t cmd;
- uint8_t pad[4];
- } dcmd_ioctl;
-
- struct dcmd_copy_req {
- uint64_t ptr;
- uint64_t size;
- } dcmd_copy;
-
- struct dcmd_fd_req {
- int64_t size;
- int32_t fd;
- uint8_t pad[4];
- } dcmd_fd_io;
-
- struct dcmd_reply {
- uint64_t size; /* used by reply to DCTL_COPYINSTR,
- DCTL_FD_READ and DCTL_FD_WRITE */
- int32_t rc; /* return code */
- uint8_t pad[4];
- } dcmd_reply;
- } u;
-} dctl_cmd_t;
-
-#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t))
-
-/*
- * The following definitions are only used by the server code.
- */
-
-#define LISTEN_BACKLOG 5
-
-/* Worker thread data */
-typedef struct wthr_info {
- list_node_t wthr_node;
- pthread_t wthr_id;
- boolean_t wthr_exit; /* termination flag */
- boolean_t wthr_free;
-} wthr_info_t;
-
-/* Control socket data */
-typedef struct dctl_sock_info {
- pthread_mutex_t dsi_mtx;
- char *dsi_path;
- struct sockaddr_un dsi_addr;
- int dsi_fd;
-} dctl_sock_info_t;
-
-typedef void *thr_func_t(void *);
-
-/* Thread pool data */
-typedef struct dctl_thr_info {
- thr_func_t *dti_thr_func;
-
- pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */
- list_t dti_list; /* list of threads in the thread pool */
- list_t dti_join_list; /* list of threads that are waiting to be
- joined */
- int dti_free; /* number of free worker threads */
-
- int dti_min;
- int dti_max_free;
-
- boolean_t dti_exit; /* global termination flag */
-} dctl_thr_info_t;
-
-/* Messaging functions */
-int dctl_read_msg(int fd, dctl_cmd_t *cmd);
-int dctl_send_msg(int fd, dctl_cmd_t *cmd);
-
-int dctl_read_data(int fd, void *ptr, size_t size);
-int dctl_send_data(int fd, const void *ptr, size_t size);
-
-/* Thread pool functions */
-int dctl_thr_pool_create(int min_thr, int max_free_thr,
- thr_func_t *thr_func);
-void dctl_thr_pool_stop();
-
-void dctl_thr_join();
-void dctl_thr_die(wthr_info_t *thr);
-void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free);
-
-#endif
diff --git a/zfs/lib/libdmu-ctl/rrwlock.c b/zfs/lib/libdmu-ctl/rrwlock.c
deleted file mode 100644
index c46ed8155..000000000
--- a/zfs/lib/libdmu-ctl/rrwlock.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)rrwlock.c 1.1 07/10/24 SMI"
-
-#include <sys/refcount.h>
-#include <sys/rrwlock.h>
-
-/*
- * This file contains the implementation of a re-entrant read
- * reader/writer lock (aka "rrwlock").
- *
- * This is a normal reader/writer lock with the additional feature
- * of allowing threads who have already obtained a read lock to
- * re-enter another read lock (re-entrant read) - even if there are
- * waiting writers.
- *
- * Callers who have not obtained a read lock give waiting writers priority.
- *
- * The rrwlock_t lock does not allow re-entrant writers, nor does it
- * allow a re-entrant mix of reads and writes (that is, it does not
- * allow a caller who has already obtained a read lock to be able to
- * then grab a write lock without first dropping all read locks, and
- * vice versa).
- *
- * The rrwlock_t uses tsd (thread specific data) to keep a list of
- * nodes (rrw_node_t), where each node keeps track of which specific
- * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
- * should be rare, a thread that grabs multiple reads on the same rrwlock_t
- * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
- * tsd list can represent a different rrwlock_t. This allows a thread
- * to enter multiple and unique rrwlock_ts for read locks at the same time.
- *
- * Since using tsd exposes some overhead, the rrwlock_t only needs to
- * keep tsd data when writers are waiting. If no writers are waiting, then
- * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
- * is needed. Once a writer attempts to grab the lock, readers then
- * keep tsd data and bump the linked readers count (rr_linked_rcount).
- *
- * If there are waiting writers and there are anonymous readers, then a
- * reader doesn't know if it is a re-entrant lock. But since it may be one,
- * we allow the read to proceed (otherwise it could deadlock). Since once
- * waiting writers are active, readers no longer bump the anonymous count,
- * the anonymous readers will eventually flush themselves out. At this point,
- * readers will be able to tell if they are a re-entrant lock (have a
- * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
- * we must let them proceed. If they are not, then the reader blocks for the
- * waiting writers. Hence, we do not starve writers.
- */
-
-/* global key for TSD */
-uint_t rrw_tsd_key;
-
-typedef struct rrw_node {
- struct rrw_node *rn_next;
- rrwlock_t *rn_rrl;
-} rrw_node_t;
-
-static rrw_node_t *
-rrn_find(rrwlock_t *rrl)
-{
- rrw_node_t *rn;
-
- if (refcount_count(&rrl->rr_linked_rcount) == 0)
- return (NULL);
-
- for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
- if (rn->rn_rrl == rrl)
- return (rn);
- }
- return (NULL);
-}
-
-/*
- * Add a node to the head of the singly linked list.
- */
-static void
-rrn_add(rrwlock_t *rrl)
-{
- rrw_node_t *rn;
-
- rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
- rn->rn_rrl = rrl;
- rn->rn_next = tsd_get(rrw_tsd_key);
- VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
-}
-
-/*
- * If a node is found for 'rrl', then remove the node from this
- * thread's list and return TRUE; otherwise return FALSE.
- */
-static boolean_t
-rrn_find_and_remove(rrwlock_t *rrl)
-{
- rrw_node_t *rn;
- rrw_node_t *prev = NULL;
-
- if (refcount_count(&rrl->rr_linked_rcount) == 0)
- return (NULL);
-
- for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
- if (rn->rn_rrl == rrl) {
- if (prev)
- prev->rn_next = rn->rn_next;
- else
- VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
- kmem_free(rn, sizeof (*rn));
- return (B_TRUE);
- }
- prev = rn;
- }
- return (B_FALSE);
-}
-
-void
-rrw_init(rrwlock_t *rrl)
-{
- mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
- rrl->rr_writer = NULL;
- refcount_create(&rrl->rr_anon_rcount);
- refcount_create(&rrl->rr_linked_rcount);
- rrl->rr_writer_wanted = B_FALSE;
-}
-
-void
-rrw_destroy(rrwlock_t *rrl)
-{
- mutex_destroy(&rrl->rr_lock);
- cv_destroy(&rrl->rr_cv);
- ASSERT(rrl->rr_writer == NULL);
- refcount_destroy(&rrl->rr_anon_rcount);
- refcount_destroy(&rrl->rr_linked_rcount);
-}
-
-static void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
-{
- mutex_enter(&rrl->rr_lock);
- ASSERT(rrl->rr_writer != curthread);
- ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
-
- while (rrl->rr_writer || (rrl->rr_writer_wanted &&
- refcount_is_zero(&rrl->rr_anon_rcount) &&
- rrn_find(rrl) == NULL))
- cv_wait(&rrl->rr_cv, &rrl->rr_lock);
-
- if (rrl->rr_writer_wanted) {
- /* may or may not be a re-entrant enter */
- rrn_add(rrl);
- (void) refcount_add(&rrl->rr_linked_rcount, tag);
- } else {
- (void) refcount_add(&rrl->rr_anon_rcount, tag);
- }
- ASSERT(rrl->rr_writer == NULL);
- mutex_exit(&rrl->rr_lock);
-}
-
-static void
-rrw_enter_write(rrwlock_t *rrl)
-{
- mutex_enter(&rrl->rr_lock);
- ASSERT(rrl->rr_writer != curthread);
-
- while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
- refcount_count(&rrl->rr_linked_rcount) > 0 ||
- rrl->rr_writer != NULL) {
- rrl->rr_writer_wanted = B_TRUE;
- cv_wait(&rrl->rr_cv, &rrl->rr_lock);
- }
- rrl->rr_writer_wanted = B_FALSE;
- rrl->rr_writer = curthread;
- mutex_exit(&rrl->rr_lock);
-}
-
-void
-rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
-{
- if (rw == RW_READER)
- rrw_enter_read(rrl, tag);
- else
- rrw_enter_write(rrl);
-}
-
-void
-rrw_exit(rrwlock_t *rrl, void *tag)
-{
- mutex_enter(&rrl->rr_lock);
- ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
- !refcount_is_zero(&rrl->rr_linked_rcount) ||
- rrl->rr_writer != NULL);
-
- if (rrl->rr_writer == NULL) {
- if (rrn_find_and_remove(rrl)) {
- if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
-
- } else {
- if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
- }
- } else {
- ASSERT(rrl->rr_writer == curthread);
- ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
- refcount_is_zero(&rrl->rr_linked_rcount));
- rrl->rr_writer = NULL;
- cv_broadcast(&rrl->rr_cv);
- }
- mutex_exit(&rrl->rr_lock);
-}
-
-boolean_t
-rrw_held(rrwlock_t *rrl, krw_t rw)
-{
- boolean_t held;
-
- mutex_enter(&rrl->rr_lock);
- if (rw == RW_WRITER) {
- held = (rrl->rr_writer == curthread);
- } else {
- held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
- !refcount_is_zero(&rrl->rr_linked_rcount));
- }
- mutex_exit(&rrl->rr_lock);
-
- return (held);
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_acl.c b/zfs/lib/libdmu-ctl/zfs_acl.c
deleted file mode 100644
index cc2f97e1b..000000000
--- a/zfs/lib/libdmu-ctl/zfs_acl.c
+++ /dev/null
@@ -1,2641 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_acl.c 1.25 08/04/08 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/sid.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/mode.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/zap.h>
-#include "fs/fs_subr.h"
-#include <acl/acl_common.h>
-
-#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
-#define DENY ACE_ACCESS_DENIED_ACE_TYPE
-#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
-
-#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
-#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
-#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
-
-#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
- ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
- ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
- ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
-
-#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\
- ACE_WRITE_OWNER)
-
-#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
-
-#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
- ZFS_ACL_PROTECTED)
-
-#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
- ZFS_ACL_OBJ_ACE)
-
-static uint16_t
-zfs_ace_v0_get_type(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_v0_get_flags(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_v0_get_mask(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_v0_get_who(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_v0_set_type(void *acep, uint16_t type)
-{
- ((zfs_oldace_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_v0_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_oldace_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_v0_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_oldace_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_v0_set_who(void *acep, uint64_t who)
-{
- ((zfs_oldace_t *)acep)->z_fuid = who;
-}
-
-/*ARGSUSED*/
-static size_t
-zfs_ace_v0_size(void *acep)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static size_t
-zfs_ace_v0_abstract_size(void)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static int
-zfs_ace_v0_mask_off(void)
-{
- return (offsetof(zfs_oldace_t, z_access_mask));
-}
-
-/*ARGSUSED*/
-static int
-zfs_ace_v0_data(void *acep, void **datap)
-{
- *datap = NULL;
- return (0);
-}
-
-static acl_ops_t zfs_acl_v0_ops = {
- zfs_ace_v0_get_mask,
- zfs_ace_v0_set_mask,
- zfs_ace_v0_get_flags,
- zfs_ace_v0_set_flags,
- zfs_ace_v0_get_type,
- zfs_ace_v0_set_type,
- zfs_ace_v0_get_who,
- zfs_ace_v0_set_who,
- zfs_ace_v0_size,
- zfs_ace_v0_abstract_size,
- zfs_ace_v0_mask_off,
- zfs_ace_v0_data
-};
-
-static uint16_t
-zfs_ace_fuid_get_type(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_fuid_get_flags(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_fuid_get_mask(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_fuid_get_who(void *args)
-{
- uint16_t entry_type;
- zfs_ace_t *acep = args;
-
- entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return (-1);
- return (((zfs_ace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_fuid_set_type(void *acep, uint16_t type)
-{
- ((zfs_ace_hdr_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_ace_hdr_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_fuid_set_who(void *arg, uint64_t who)
-{
- zfs_ace_t *acep = arg;
-
- uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return;
- acep->z_fuid = who;
-}
-
-static size_t
-zfs_ace_fuid_size(void *acep)
-{
- zfs_ace_hdr_t *zacep = acep;
- uint16_t entry_type;
-
- switch (zacep->z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- return (sizeof (zfs_object_ace_t));
- case ALLOW:
- case DENY:
- entry_type =
- (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
- if (entry_type == ACE_OWNER ||
- entry_type == (ACE_GROUP | ACE_IDENTIFIER_GROUP) ||
- entry_type == ACE_EVERYONE)
- return (sizeof (zfs_ace_hdr_t));
- /*FALLTHROUGH*/
- default:
- return (sizeof (zfs_ace_t));
- }
-}
-
-static size_t
-zfs_ace_fuid_abstract_size(void)
-{
- return (sizeof (zfs_ace_hdr_t));
-}
-
-static int
-zfs_ace_fuid_mask_off(void)
-{
- return (offsetof(zfs_ace_hdr_t, z_access_mask));
-}
-
-static int
-zfs_ace_fuid_data(void *acep, void **datap)
-{
- zfs_ace_t *zacep = acep;
- zfs_object_ace_t *zobjp;
-
- switch (zacep->z_hdr.z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjp = acep;
- *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
- return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
- default:
- *datap = NULL;
- return (0);
- }
-}
-
-static acl_ops_t zfs_acl_fuid_ops = {
- zfs_ace_fuid_get_mask,
- zfs_ace_fuid_set_mask,
- zfs_ace_fuid_get_flags,
- zfs_ace_fuid_set_flags,
- zfs_ace_fuid_get_type,
- zfs_ace_fuid_set_type,
- zfs_ace_fuid_get_who,
- zfs_ace_fuid_set_who,
- zfs_ace_fuid_size,
- zfs_ace_fuid_abstract_size,
- zfs_ace_fuid_mask_off,
- zfs_ace_fuid_data
-};
-
-static int
-zfs_acl_version(int version)
-{
- if (version < ZPL_VERSION_FUID)
- return (ZFS_ACL_VERSION_INITIAL);
- else
- return (ZFS_ACL_VERSION_FUID);
-}
-
-static int
-zfs_acl_version_zp(znode_t *zp)
-{
- return (zfs_acl_version(zp->z_zfsvfs->z_version));
-}
-
-static zfs_acl_t *
-zfs_acl_alloc(int vers)
-{
- zfs_acl_t *aclp;
-
- aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
- list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
- offsetof(zfs_acl_node_t, z_next));
- aclp->z_version = vers;
- if (vers == ZFS_ACL_VERSION_FUID)
- aclp->z_ops = zfs_acl_fuid_ops;
- else
- aclp->z_ops = zfs_acl_v0_ops;
- return (aclp);
-}
-
-static zfs_acl_node_t *
-zfs_acl_node_alloc(size_t bytes)
-{
- zfs_acl_node_t *aclnode;
-
- aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
- if (bytes) {
- aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
- aclnode->z_allocdata = aclnode->z_acldata;
- aclnode->z_allocsize = bytes;
- aclnode->z_size = bytes;
- }
-
- return (aclnode);
-}
-
-static void
-zfs_acl_node_free(zfs_acl_node_t *aclnode)
-{
- if (aclnode->z_allocsize)
- kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
- kmem_free(aclnode, sizeof (zfs_acl_node_t));
-}
-
-static void
-zfs_acl_release_nodes(zfs_acl_t *aclp)
-{
- zfs_acl_node_t *aclnode;
-
- while (aclnode = list_head(&aclp->z_acl)) {
- list_remove(&aclp->z_acl, aclnode);
- zfs_acl_node_free(aclnode);
- }
- aclp->z_acl_count = 0;
- aclp->z_acl_bytes = 0;
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
- zfs_acl_release_nodes(aclp);
- list_destroy(&aclp->z_acl);
- kmem_free(aclp, sizeof (zfs_acl_t));
-}
-
-static boolean_t
-zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
-{
- /*
- * first check type of entry
- */
-
- switch (iflags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- case ACE_IDENTIFIER_GROUP:
- case ACE_EVERYONE:
- case 0: /* User entry */
- break;
- default:
- return (B_FALSE);
-
- }
-
- /*
- * next check inheritance level flags
- */
-
- if (type != ALLOW && type > MAX_ACE_TYPE) {
- return (B_FALSE);
- }
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (aclp->z_version < ZFS_ACL_VERSION_FUID)
- return (B_FALSE);
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- }
-
- /*
- * Only directories should have inheritance flags.
- */
- if (obj_type != VDIR && (iflags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
- return (B_FALSE);
- }
-
- if (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) == 0) {
- return (B_FALSE);
- }
- }
-
- return (B_TRUE);
-}
-
-static void *
-zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
- uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
-{
- zfs_acl_node_t *aclnode;
-
- if (start == NULL) {
- aclnode = list_head(&aclp->z_acl);
- if (aclnode == NULL)
- return (NULL);
-
- aclp->z_next_ace = aclnode->z_acldata;
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- }
-
- aclnode = aclp->z_curr_node;
-
- if (aclnode == NULL)
- return (NULL);
-
- if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
- aclnode = list_next(&aclp->z_acl, aclnode);
- if (aclnode == NULL)
- return (NULL);
- else {
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- aclp->z_next_ace = aclnode->z_acldata;
- }
- }
-
- if (aclnode->z_ace_idx < aclnode->z_ace_count) {
- void *acep = aclp->z_next_ace;
- *iflags = aclp->z_ops.ace_flags_get(acep);
- *type = aclp->z_ops.ace_type_get(acep);
- *access_mask = aclp->z_ops.ace_mask_get(acep);
- *who = aclp->z_ops.ace_who_get(acep);
- aclp->z_next_ace = (caddr_t)aclp->z_next_ace +
- aclp->z_ops.ace_size(acep);
- aclnode->z_ace_idx++;
- return ((void *)acep);
- }
- return (NULL);
-}
-
-/*ARGSUSED*/
-static uint64_t
-zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
- uint16_t *flags, uint16_t *type, uint32_t *mask)
-{
- zfs_acl_t *aclp = datap;
- zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
- uint64_t who;
-
- acep = zfs_acl_next_ace(aclp, acep, &who, mask,
- flags, type);
- return ((uint64_t)(uintptr_t)acep);
-}
-
-static zfs_acl_node_t *
-zfs_acl_curr_node(zfs_acl_t *aclp)
-{
- ASSERT(aclp->z_curr_node);
- return (aclp->z_curr_node);
-}
-
-/*
- * Copy ACE to internal ZFS format.
- * While processing the ACL each ACE will be validated for correctness.
- * ACE FUIDs will be created later.
- */
-int
-zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
- zfs_ace_t *z_acl, int aclcnt, size_t *size)
-{
- int i;
- uint16_t entry_type;
- zfs_ace_t *aceptr = z_acl;
- ace_t *acep = datap;
- zfs_object_ace_t *zobjacep;
- ace_object_t *aceobjp;
-
- for (i = 0; i != aclcnt; i++) {
- aceptr->z_hdr.z_access_mask = acep->a_access_mask;
- aceptr->z_hdr.z_flags = acep->a_flags;
- aceptr->z_hdr.z_type = acep->a_type;
- entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
- if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
- entry_type != ACE_EVERYONE) {
- if (!aclp->z_has_fuids)
- aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who);
- aceptr->z_fuid = (uint64_t)acep->a_who;
- }
-
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
- aceptr->z_hdr.z_flags) != B_TRUE)
- return (EINVAL);
-
- switch (acep->a_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjacep = (zfs_object_ace_t *)aceptr;
- aceobjp = (ace_object_t *)acep;
-
- bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
- sizeof (aceobjp->a_obj_type));
- bcopy(aceobjp->a_inherit_obj_type,
- zobjacep->z_inherit_type,
- sizeof (aceobjp->a_inherit_obj_type));
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
- break;
- default:
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
- }
-
- aceptr = (zfs_ace_t *)((caddr_t)aceptr +
- aclp->z_ops.ace_size(aceptr));
- }
-
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
-
- return (0);
-}
-
-/*
- * Copy ZFS ACEs to fixed size ace_t layout
- */
-static void
-zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
- void *datap, int filter)
-{
- uint64_t who;
- uint32_t access_mask;
- uint16_t iflags, type;
- zfs_ace_hdr_t *zacep = NULL;
- ace_t *acep = datap;
- ace_object_t *objacep;
- zfs_object_ace_t *zobjacep;
- size_t ace_size;
- uint16_t entry_type;
-
- while (zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type)) {
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (filter) {
- continue;
- }
- zobjacep = (zfs_object_ace_t *)zacep;
- objacep = (ace_object_t *)acep;
- bcopy(zobjacep->z_object_type,
- objacep->a_obj_type,
- sizeof (zobjacep->z_object_type));
- bcopy(zobjacep->z_inherit_type,
- objacep->a_inherit_obj_type,
- sizeof (zobjacep->z_inherit_type));
- ace_size = sizeof (ace_object_t);
- break;
- default:
- ace_size = sizeof (ace_t);
- break;
- }
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
- if ((entry_type != ACE_OWNER &&
- entry_type != (ACE_GROUP | ACE_IDENTIFIER_GROUP) &&
- entry_type != ACE_EVERYONE)) {
- acep->a_who = zfs_fuid_map_id(zfsvfs, who,
- cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
- ZFS_ACE_GROUP : ZFS_ACE_USER);
- } else {
- acep->a_who = (uid_t)(int64_t)who;
- }
- acep->a_access_mask = access_mask;
- acep->a_flags = iflags;
- acep->a_type = type;
- acep = (ace_t *)((caddr_t)acep + ace_size);
- }
-}
-
-static int
-zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
- zfs_oldace_t *z_acl, int aclcnt, size_t *size)
-{
- int i;
- zfs_oldace_t *aceptr = z_acl;
-
- for (i = 0; i != aclcnt; i++, aceptr++) {
- aceptr->z_access_mask = acep[i].a_access_mask;
- aceptr->z_type = acep[i].a_type;
- aceptr->z_flags = acep[i].a_flags;
- aceptr->z_fuid = acep[i].a_who;
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
- aceptr->z_flags) != B_TRUE)
- return (EINVAL);
- }
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
- return (0);
-}
-
-/*
- * convert old ACL format to new
- */
-void
-zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
-{
- zfs_oldace_t *oldaclp;
- int i;
- uint16_t type, iflags;
- uint32_t access_mask;
- uint64_t who;
- void *cookie = NULL;
- zfs_acl_node_t *newaclnode;
-
- ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
- /*
- * First create the ACE in a contiguous piece of memory
- * for zfs_copy_ace_2_fuid().
- *
- * We only convert an ACL once, so this won't happen
- * every time.
- */
- oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
- KM_SLEEP);
- i = 0;
- while (cookie = zfs_acl_next_ace(aclp, cookie, &who,
- &access_mask, &iflags, &type)) {
- oldaclp[i].z_flags = iflags;
- oldaclp[i].z_type = type;
- oldaclp[i].z_fuid = who;
- oldaclp[i++].z_access_mask = access_mask;
- }
-
- newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
- sizeof (zfs_object_ace_t));
- aclp->z_ops = zfs_acl_fuid_ops;
- VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp,
- newaclnode->z_acldata, aclp->z_acl_count,
- &newaclnode->z_size) == 0);
- newaclnode->z_ace_count = aclp->z_acl_count;
- aclp->z_version = ZFS_ACL_VERSION;
- kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
-
- /*
- * Release all previous ACL nodes
- */
-
- zfs_acl_release_nodes(aclp);
-
- list_insert_head(&aclp->z_acl, newaclnode);
-
- aclp->z_acl_bytes = newaclnode->z_size;
- aclp->z_acl_count = newaclnode->z_ace_count;
-
-}
-
-/*
- * Convert unix access mask to v4 access mask
- *
- * Only the "other" permission bits (S_IROTH, S_IWOTH, S_IXOTH) of
- * access_mask are examined; they map to ACE_READ_DATA, ACE_WRITE_DATA
- * and ACE_EXECUTE respectively.  Every other bit is ignored.
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
-	uint32_t v4_mask;
-
-	v4_mask = (access_mask & S_IROTH) ? ACE_READ_DATA : 0;
-	v4_mask |= (access_mask & S_IWOTH) ? ACE_WRITE_DATA : 0;
-	v4_mask |= (access_mask & S_IXOTH) ? ACE_EXECUTE : 0;
-
-	return (v4_mask);
-}
-
-/*
- * Fill in the mask, type and flags of the ACE at acep through the ACL's
- * ops vector.  The who field is written only when entry_type does not
- * denote one of the abstract entries owner@, group@ or everyone@.
- */
-static void
-zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
-    uint16_t access_type, uint64_t fuid, uint16_t entry_type)
-{
-	uint16_t who_type = entry_type & ACE_TYPE_FLAGS;
-	boolean_t abstract;
-
-	abstract = (who_type == ACE_OWNER ||
-	    who_type == (ACE_GROUP | ACE_IDENTIFIER_GROUP) ||
-	    who_type == ACE_EVERYONE);
-
-	aclp->z_ops.ace_mask_set(acep, access_mask);
-	aclp->z_ops.ace_type_set(acep, access_type);
-	aclp->z_ops.ace_flags_set(acep, entry_type);
-
-	/* abstract entries carry no id */
-	if (!abstract)
-		aclp->z_ops.ace_who_set(acep, fuid);
-}
-
-/*
- * Record one permission bit while deriving a mode from an ACL.
- *
- * Only the first ACE that mentions a given bit decides it: once the bit
- * is present in *seen, later ACEs are ignored for that bit.  The bit is
- * added to *mode only when the deciding ACE is an ALLOW entry.
- */
-static void
-zfs_mode_note_bit(mode_t *mode, mode_t *seen, mode_t bit, uint16_t type)
-{
-	if (*seen & bit)
-		return;
-	*seen |= bit;
-	if (type == ALLOW)
-		*mode |= bit;
-}
-
-/*
- * Determine mode of file based on ACL.
- * Also, create FUIDs for any User/Group ACEs
- *
- * The file type and the setuid/setgid/sticky bits are taken from the
- * znode's current zp_mode; the nine permission bits are derived from the
- * owner@, group@ and everyone@ ACEs of aclp.  For each permission bit the
- * first ACE that mentions it wins: an ALLOW entry sets the bit, any other
- * type leaves it clear.  everyone@ entries are applied to the user, group
- * and other bits alike.
- *
- * Every ACE whose entry type is a plain user (0) or ACE_IDENTIFIER_GROUP
- * has its who field rewritten with the result of zfs_fuid_create(); cr,
- * tx and fuidp are passed straight through to that call.
- *
- * Returns the computed mode.
- */
-static uint64_t
-zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
-    zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
-{
-	int entry_type;
-	mode_t mode;
-	mode_t seen = 0;
-	zfs_ace_hdr_t *acep = NULL;
-	uint64_t who;
-	uint16_t iflags, type;
-	uint32_t access_mask;
-
-	mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
-
-	while ((acep = zfs_acl_next_ace(aclp, acep, &who,
-	    &access_mask, &iflags, &type)) != NULL) {
-
-		entry_type = (iflags & ACE_TYPE_FLAGS);
-
-		/*
-		 * Skip over owner@, group@ or everyone@ inherit only ACEs;
-		 * they do not contribute to this object's mode and carry
-		 * no id.  Inherit only user/group ACEs must still reach
-		 * the FUID creation at the bottom of the loop, otherwise
-		 * their who value would be stored without being mapped.
-		 */
-		if ((iflags & ACE_INHERIT_ONLY_ACE) &&
-		    (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
-		    entry_type == ACE_EVERYONE))
-			continue;
-
-		if (entry_type == ACE_OWNER) {
-			if (access_mask & ACE_READ_DATA)
-				zfs_mode_note_bit(&mode, &seen, S_IRUSR, type);
-			if (access_mask & ACE_WRITE_DATA)
-				zfs_mode_note_bit(&mode, &seen, S_IWUSR, type);
-			if (access_mask & ACE_EXECUTE)
-				zfs_mode_note_bit(&mode, &seen, S_IXUSR, type);
-		} else if (entry_type == OWNING_GROUP) {
-			if (access_mask & ACE_READ_DATA)
-				zfs_mode_note_bit(&mode, &seen, S_IRGRP, type);
-			if (access_mask & ACE_WRITE_DATA)
-				zfs_mode_note_bit(&mode, &seen, S_IWGRP, type);
-			if (access_mask & ACE_EXECUTE)
-				zfs_mode_note_bit(&mode, &seen, S_IXGRP, type);
-		} else if (entry_type == ACE_EVERYONE) {
-			if (access_mask & ACE_READ_DATA) {
-				zfs_mode_note_bit(&mode, &seen, S_IRUSR, type);
-				zfs_mode_note_bit(&mode, &seen, S_IRGRP, type);
-				zfs_mode_note_bit(&mode, &seen, S_IROTH, type);
-			}
-			if (access_mask & ACE_WRITE_DATA) {
-				zfs_mode_note_bit(&mode, &seen, S_IWUSR, type);
-				zfs_mode_note_bit(&mode, &seen, S_IWGRP, type);
-				zfs_mode_note_bit(&mode, &seen, S_IWOTH, type);
-			}
-			if (access_mask & ACE_EXECUTE) {
-				zfs_mode_note_bit(&mode, &seen, S_IXUSR, type);
-				zfs_mode_note_bit(&mode, &seen, S_IXGRP, type);
-				zfs_mode_note_bit(&mode, &seen, S_IXOTH, type);
-			}
-		}
-		/*
-		 * Now handle FUID create for user/group ACEs
-		 */
-		if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) {
-			aclp->z_ops.ace_who_set(acep,
-			    zfs_fuid_create(zp->z_zfsvfs, who, cr,
-			    (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP,
-			    tx, fuidp));
-		}
-	}
-	return (mode);
-}
-
-/*
- * Build an in-core ACL from the ACE data embedded in the znode.
- *
- * With will_modify set the embedded ACE bytes are copied into a freshly
- * allocated node; otherwise the node merely points at the znode's own
- * z_ace_data.  Returns the new ACL holding that single node.
- */
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
-	zfs_acl_phys_t *physacl = &zp->z_phys->zp_acl;
-	zfs_acl_node_t *node;
-	zfs_acl_t *aclp;
-
-	aclp = zfs_acl_alloc(physacl->z_acl_version);
-
-	/*
-	 * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
-	 * Version 0 didn't have a size field, only a count.
-	 */
-	if (physacl->z_acl_version != ZFS_ACL_VERSION_INITIAL) {
-		aclp->z_acl_count = physacl->z_acl_count;
-		aclp->z_acl_bytes = physacl->z_acl_size;
-	} else {
-		aclp->z_acl_count = physacl->z_acl_size;
-		aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
-	}
-
-	if (will_modify) {
-		/* private copy the caller may edit */
-		node = zfs_acl_node_alloc(aclp->z_acl_bytes);
-		node->z_ace_count = aclp->z_acl_count;
-		bcopy(physacl->z_ace_data, node->z_acldata,
-		    aclp->z_acl_bytes);
-	} else {
-		/* borrow the bytes stored in the znode */
-		node = zfs_acl_node_alloc(0);
-		node->z_ace_count = aclp->z_acl_count;
-		node->z_size = aclp->z_acl_bytes;
-		node->z_acldata = &physacl->z_ace_data[0];
-	}
-
-	list_insert_head(&aclp->z_acl, node);
-
-	return (aclp);
-}
-
-/*
- * Read an external acl object.
- *
- * When the znode has no external ACL object the work is delegated to
- * zfs_acl_node_read_internal().  Otherwise a single node large enough
- * for the whole ACL is allocated and filled with dmu_read().
- *
- * The caller must hold zp->z_acl_lock.  On success *aclpp receives the
- * new ACL and 0 is returned; on a read failure the partially built ACL
- * is freed, *aclpp is left untouched and the dmu_read() error is returned.
- */
-static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
-{
-	uint64_t extobj = zp->z_phys->zp_acl.z_acl_extern_obj;
-	zfs_acl_node_t *node;
-	zfs_acl_t *newacl;
-	size_t nbytes;
-	size_t nentries;
-	int error;
-
-	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
-	if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
-		*aclpp = zfs_acl_node_read_internal(zp, will_modify);
-		return (0);
-	}
-
-	newacl = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-	if (zp->z_phys->zp_acl.z_acl_version != ZFS_ACL_VERSION_INITIAL) {
-		nbytes = zp->z_phys->zp_acl.z_acl_size;
-		nentries = zp->z_phys->zp_acl.z_acl_count;
-		if (nbytes == 0)
-			nbytes = nentries * sizeof (zfs_ace_t);
-	} else {
-		zfs_acl_phys_v0_t *zacl0 =
-		    (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
-
-		nbytes = ZFS_ACL_SIZE(zacl0->z_acl_count);
-		nentries = zacl0->z_acl_count;
-	}
-
-	node = zfs_acl_node_alloc(nbytes);
-	list_insert_head(&newacl->z_acl, node);
-	node->z_ace_count = nentries;
-	newacl->z_acl_count = nentries;
-	newacl->z_acl_bytes = nbytes;
-
-	error = dmu_read(zp->z_zfsvfs->z_os, extobj, 0,
-	    nbytes, node->z_acldata);
-	if (error != 0) {
-		zfs_acl_free(newacl);
-		return (error);
-	}
-
-	*aclpp = newacl;
-	return (0);
-}
-
-/*
- * common code for setting ACLs.
- *
- * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
- * already checked the acl and knows whether to inherit.
- * NOTE(review): the signature below has no ihp parameter; the sentence
- * above looks stale -- confirm and drop.
- *
- * Stores aclp as the ACL of zp within transaction tx:
- *  - zp_mode is recomputed from the ACL by zfs_mode_fuid_compute(), which
- *    also receives cr and fuidp;
- *  - an ACL larger than ZFS_ACE_SPACE is written to an external DMU object
- *    (allocated, resized or replaced as needed), a smaller one is copied
- *    into the znode's embedded z_ace_data and any external object is freed;
- *  - the on-disk count/size/version fields and the ACL wide flags in
- *    zp_flags are updated, and ZFS_ACL_TRIVIAL is set when
- *    ace_trivial_common() returns 0.
- *
- * Both zp->z_lock and zp->z_acl_lock must be held (asserted).
- * Returns 0, or the error from dmu_object_free().
- */
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
- zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
-{
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_acl_phys_t *zacl = &zphys->zp_acl;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
- uint64_t off = 0;
- dmu_object_type_t otype;
- zfs_acl_node_t *aclnode;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx);
-
- /*
- * Decide which object type to use. If we are forced to
- * use old ACL format then transform ACL into zfs_oldace_t
- * layout.
- */
- if (!zfsvfs->z_use_fuids) {
- otype = DMU_OT_OLDACL;
- } else {
- if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
- (zfsvfs->z_version >= ZPL_VERSION_FUID))
- zfs_acl_xform(zp, aclp);
- ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
- otype = DMU_OT_ACL;
- }
-
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid && aclp->z_version != zacl->z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
- } else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- aclp->z_acl_bytes, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
- }
- } else {
- void *start = zacl->z_ace_data;
- /*
- * Migrating back embedded?
- */
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
- }
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start, aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
- }
-
- /*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- zphys->zp_acl.z_acl_size = aclp->z_acl_count;
- zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
- } else {
- zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
- }
-
- zphys->zp_acl.z_acl_version = aclp->z_version;
-
- /*
- * Replace ACL wide bits, but first clear them.
- */
- zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
-
- zp->z_phys->zp_flags |= aclp->z_hints;
-
- if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
-
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- return (0);
-}
-
-/*
- * Update access mask for prepended ACE
- *
- * This applies the "groupmask" value for aclmode property.
- *
- * For each of READ_DATA, WRITE_DATA, APPEND_DATA and EXECUTE that the
- * original ACE (origacep) carries, the prepended ACE (acep) has the bit
- * cleared when mode grants the corresponding permission and set when it
- * does not.  The owner permission bits of mode are consulted when acep
- * is a plain user entry whose id equals owner; the group bits otherwise.
- */
-static void
-zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep,
-    mode_t mode, uint64_t owner)
-{
-	uint16_t aceflags = aclp->z_ops.ace_flags_get(acep);
-	uint64_t fuid = aclp->z_ops.ace_who_get(acep);
-	uint32_t origmask = aclp->z_ops.ace_mask_get(origacep);
-	uint32_t newmask = aclp->z_ops.ace_mask_get(acep);
-	int rbit, wbit, xbit;
-	int owner_entry;
-
-	owner_entry =
-	    !(aceflags & (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)) &&
-	    (fuid == owner);
-
-	if (owner_entry) {
-		rbit = S_IRUSR;
-		wbit = S_IWUSR;
-		xbit = S_IXUSR;
-	} else {
-		rbit = S_IRGRP;
-		wbit = S_IWGRP;
-		xbit = S_IXGRP;
-	}
-
-	if (origmask & ACE_READ_DATA) {
-		newmask = (mode & rbit) ?
-		    (newmask & ~ACE_READ_DATA) : (newmask | ACE_READ_DATA);
-	}
-
-	if (origmask & ACE_WRITE_DATA) {
-		newmask = (mode & wbit) ?
-		    (newmask & ~ACE_WRITE_DATA) : (newmask | ACE_WRITE_DATA);
-	}
-
-	/* append follows the write permission bit */
-	if (origmask & ACE_APPEND_DATA) {
-		newmask = (mode & wbit) ?
-		    (newmask & ~ACE_APPEND_DATA) : (newmask | ACE_APPEND_DATA);
-	}
-
-	if (origmask & ACE_EXECUTE) {
-		newmask = (mode & xbit) ?
-		    (newmask & ~ACE_EXECUTE) : (newmask | ACE_EXECUTE);
-	}
-
-	aclp->z_ops.ace_mask_set(acep, newmask);
-}
-
-/*
- * Apply mode to canonical six ACEs.
- *
- * Treats the final six abstract-sized entries of the last ACL node as the
- * owner@, group@ and everyone@ pairs and hands each pair to
- * adjust_ace_pair_common() together with the matching permission bits of
- * mode: the owner bits shifted down by 6, the group bits shifted down by
- * 3, and mode itself for everyone@.
- *
- * NOTE(review): assumes the tail node really ends with those six ACEs;
- * only the node's existence is asserted here.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
- zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
- void *acep;
- int maskoff = aclp->z_ops.ace_mask_off();
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
- ASSERT(aclnode != NULL);
-
- acep = (void *)((caddr_t)aclnode->z_acldata +
- aclnode->z_size - (abstract_size * 6));
-
- /*
- * Fixup final ACEs to match the mode
- */
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0700) >> 6); /* owner@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0070) >> 3); /* group@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
- adjust_ace_pair_common(acep, maskoff,
- abstract_size, mode); /* everyone@ */
-}
-
-
-/*
- * Does the ACE at acep have exactly the given access mask, the given
- * allow/deny type and the given entry type (flags masked with
- * ACE_TYPE_FLAGS)?  Returns non-zero on a full match, 0 otherwise.
- */
-static int
-zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny,
-    int entry_type, int accessmask)
-{
-	uint32_t ace_mask = aclp->z_ops.ace_mask_get(acep);
-	uint16_t ace_type = aclp->z_ops.ace_type_get(acep);
-	uint16_t ace_flags = aclp->z_ops.ace_flags_get(acep);
-
-	if (ace_mask != accessmask)
-		return (0);
-	if (ace_type != allow_deny)
-		return (0);
-	return ((ace_flags & ACE_TYPE_FLAGS) == entry_type);
-}
-
-/*
- * Can prepended ACE be reused?
- *
- * Returns B_TRUE only when all of the following hold for the ACE that
- * precedes acep (prevacep):
- *  - it exists (prevacep != NULL);
- *  - its type is DENY;
- *  - its flags are exactly the ACE_IDENTIFIER_GROUP bit of acep's flags
- *    (i.e. no other flag is set on it);
- *  - its mask has no bit outside of (acep's mask & OKAY_MASK_BITS).
- * Otherwise returns B_FALSE.
- */
-static int
-zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep)
-{
- int okay_masks;
- uint16_t prevtype;
- uint16_t prevflags;
- uint16_t flags;
- uint32_t mask, prevmask;
-
- if (prevacep == NULL)
- return (B_FALSE);
-
- prevtype = aclp->z_ops.ace_type_get(prevacep);
- prevflags = aclp->z_ops.ace_flags_get(prevacep);
- flags = aclp->z_ops.ace_flags_get(acep);
- mask = aclp->z_ops.ace_mask_get(acep);
- prevmask = aclp->z_ops.ace_mask_get(prevacep);
-
- if (prevtype != DENY)
- return (B_FALSE);
-
- if (prevflags != (flags & ACE_IDENTIFIER_GROUP))
- return (B_FALSE);
-
- okay_masks = (mask & OKAY_MASK_BITS);
-
- if (prevmask & ~okay_masks)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Insert new ACL node into chain of zfs_acl_node_t's
- *
- * This will result in two possible results.
- * 1. If the ACL is currently just a single zfs_acl_node and
- * we are prepending the entry then current acl node will have
- * a new node inserted above it.
- *
- * 2. If we are inserting in the middle of current acl node then
- * the current node will be split in two and new node will be inserted
- * in between the two split nodes.
- *
- * acep is the ACE in the current node in front of which the new entry
- * goes.  The new node has room for one ACE of the same size as acep and
- * is returned with its data unset; the caller fills it in.  In case 2
- * the trailing node is allocated with no storage of its own and simply
- * points at acep inside the current node's data; it also becomes the
- * ACL's current node with its index reset to 1.  The ACL's total count
- * and byte size are increased for the new entry.
- */
-static zfs_acl_node_t *
-zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
-{
- zfs_acl_node_t *newnode;
- zfs_acl_node_t *trailernode = NULL;
- zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp);
- int curr_idx = aclp->z_curr_node->z_ace_idx;
- int trailer_count;
- size_t oldsize;
-
- newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
- newnode->z_ace_count = 1;
-
- oldsize = currnode->z_size;
-
- if (curr_idx != 1) {
- trailernode = zfs_acl_node_alloc(0);
- trailernode->z_acldata = acep;
-
- trailer_count = currnode->z_ace_count - curr_idx + 1;
- currnode->z_ace_count = curr_idx - 1;
- currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
- trailernode->z_size = oldsize - currnode->z_size;
- trailernode->z_ace_count = trailer_count;
- }
-
- aclp->z_acl_count += 1;
- aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
-
- if (curr_idx == 1)
- list_insert_before(&aclp->z_acl, currnode, newnode);
- else
- list_insert_after(&aclp->z_acl, currnode, newnode);
- if (trailernode) {
- list_insert_after(&aclp->z_acl, newnode, trailernode);
- aclp->z_curr_node = trailernode;
- trailernode->z_ace_idx = 1;
- }
-
- return (newnode);
-}
-
-/*
- * Prepend deny ACE
- *
- * Inserts a new node in front of acep via zfs_acl_ace_insert() and
- * initializes its single entry as a DENY ACE with an empty mask, the same
- * who value as acep and acep's entry type flags (flags masked with
- * ACE_TYPE_FLAGS).  The mask is then derived from mode by
- * zfs_acl_prepend_fixup(), using the znode's zp_uid as the owner.
- *
- * Returns a pointer to the new ACE.
- */
-static void *
-zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
- mode_t mode)
-{
- zfs_acl_node_t *aclnode;
- void *newacep;
- uint64_t fuid;
- uint16_t flags;
-
- aclnode = zfs_acl_ace_insert(aclp, acep);
- newacep = aclnode->z_acldata;
- fuid = aclp->z_ops.ace_who_get(acep);
- flags = aclp->z_ops.ace_flags_get(acep);
- zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid);
-
- return (newacep);
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- *
- * A new node is inserted in front of acep and filled with a copy of
- * acep's type, mask and who, plus acep's flags with ACE_INHERIT_ONLY_ACE
- * added.  acep itself then loses every ALL_INHERIT flag.  Finally the
- * ACL's iteration state is rewound (z_next_ace points back at acep and
- * the current node's index is decremented) so that the walk in progress
- * visits acep again.
- */
-static void
-zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
-{
-	zfs_acl_node_t *aclnode;
-	zfs_acl_node_t *currnode;
-	void *newacep;
-	uint16_t type, flags;
-	uint32_t mask;
-	uint64_t fuid;
-
-	/* snapshot the original ACE before the chain is modified */
-	type = aclp->z_ops.ace_type_get(acep);
-	flags = aclp->z_ops.ace_flags_get(acep);
-	mask = aclp->z_ops.ace_mask_get(acep);
-	fuid = aclp->z_ops.ace_who_get(acep);
-
-	aclnode = zfs_acl_ace_insert(aclp, acep);
-	newacep = aclnode->z_acldata;
-
-	/* the type only needs to be stored once */
-	aclp->z_ops.ace_type_set(newacep, type);
-	aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
-	aclp->z_ops.ace_mask_set(newacep, mask);
-	aclp->z_ops.ace_who_set(newacep, fuid);
-	aclp->z_next_ace = acep;
-	flags &= ~ALL_INHERIT;
-	aclp->z_ops.ace_flags_set(acep, flags);
-	currnode = zfs_acl_curr_node(aclp);
-	ASSERT(currnode->z_ace_idx >= 1);
-	currnode->z_ace_idx -= 1;
-}
-
-/*
- * Do the last six ACEs of the ACL form the canonical six?
- *
- * Looks at the final six abstract-sized entries of the last ACL node and
- * checks them, in order, against:
- *   deny owner@ (empty mask), allow owner@ (OWNER_ALLOW_MASK),
- *   deny group@ (empty mask), allow group@ (empty mask),
- *   deny everyone@ (EVERYONE_DENY_MASK),
- *   allow everyone@ (EVERYONE_ALLOW_MASK).
- *
- * Returns 1 when all six match, 0 otherwise (including when the last
- * node holds fewer than six ACEs).
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp)
-{
-	zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
-	size_t abstract_size = aclp->z_ops.ace_abstract_size();
-	caddr_t six;
-
-	ASSERT(aclnode != NULL);
-
-	if (aclnode->z_ace_count < 6)
-		return (0);
-
-	/* start of the sixth-from-last entry */
-	six = (caddr_t)aclnode->z_acldata +
-	    aclnode->z_size - (abstract_size * 6);
-
-	return (zfs_acl_ace_match(aclp, six + (abstract_size * 0),
-	    DENY, ACE_OWNER, 0) &&
-	    zfs_acl_ace_match(aclp, six + (abstract_size * 1),
-	    ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
-	    zfs_acl_ace_match(aclp, six + (abstract_size * 2),
-	    DENY, OWNING_GROUP, 0) &&
-	    zfs_acl_ace_match(aclp, six + (abstract_size * 3),
-	    ALLOW, OWNING_GROUP, 0) &&
-	    zfs_acl_ace_match(aclp, six + (abstract_size * 4),
-	    DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
-	    zfs_acl_ace_match(aclp, six + (abstract_size * 5),
-	    ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK));
-}
-
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner. If so then limit
- * group permissions, based on what extra permissions
- * group has.
- *
- * extramode holds the group permission bits of mode that the owner bits
- * do not also grant, expressed in the low three ("other") bit positions.
- * When prevacep is flagged ACE_IDENTIFIER_GROUP, each such extra
- * permission is removed from the masks of both acep and prevacep
- * (write also removes append).  Both masks are written back
- * unconditionally, unchanged if nothing applied.
- */
-static void
-zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
- mode_t mode)
-{
- uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
- uint32_t mask = aclp->z_ops.ace_mask_get(acep);
- uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
- mode_t extramode = (mode >> 3) & 07;
- mode_t ownermode = (mode >> 6);
-
- if (prevflags & ACE_IDENTIFIER_GROUP) {
-
- extramode &= ~ownermode;
-
- if (extramode) {
- if (extramode & S_IROTH) {
- prevmask &= ~ACE_READ_DATA;
- mask &= ~ACE_READ_DATA;
- }
- if (extramode & S_IWOTH) {
- prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- }
- if (extramode & S_IXOTH) {
- prevmask &= ~ACE_EXECUTE;
- mask &= ~ACE_EXECUTE;
- }
- }
- }
- aclp->z_ops.ace_mask_set(acep, mask);
- aclp->z_ops.ace_mask_set(prevacep, prevmask);
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- *
- * Rewrites aclp in place so that it reflects mode:
- *  - z_hints starts from the V4_ACL_WIDE_FLAGS currently set in zp_flags;
- *  - with acl_mode == ZFS_ACL_DISCARD every existing node is released;
- *  - each remaining ACE is visited: non ALLOW/DENY and inherit-only ACEs
- *    only contribute hints, inheritable ACEs are split by
- *    zfs_acl_split_ace(), owner@/group@/everyone@ ACEs have the OGE_CLEAR
- *    bits removed, and for other ALLOW ACEs under ZFS_ACL_GROUPMASK a
- *    preceding DENY ACE is reused or prepended and fixed up;
- *  - unless the ACL already ends with the canonical six ACEs, a node
- *    holding them is appended;
- *  - finally the canonical six are adjusted to mode.
- *
- * Both zp->z_acl_lock and zp->z_lock must be held (asserted).
- */
-static void
-zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- void *acep = NULL, *prevacep = NULL;
- uint64_t who;
- int i;
- int entry_type;
- int reuse_deny;
- int need_canonical_six = 1;
- uint16_t iflags, type;
- uint32_t access_mask;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
- aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
-
- /*
- * If discard then just discard all ACL nodes which
- * represent the ACEs.
- *
- * New owner@/group@/everyone@ ACEs will be added
- * later.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
- zfs_acl_release_nodes(aclp);
-
- while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type)) {
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
- iflags = (iflags & ALL_INHERIT); /* from here on only inheritance bits */
-
- if ((type != ALLOW && type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- if (iflags)
- aclp->z_hints |= ZFS_INHERIT_ACE;
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- break;
- }
- goto nextace;
- }
-
- /*
- * Need to split ace into two?
- *
- * NOTE(review): the inherit-only test below is always true
- * here, because inherit-only ACEs already took the goto above.
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) &&
- (!(iflags & ACE_INHERIT_ONLY_ACE))) {
- zfs_acl_split_ace(aclp, acep);
- aclp->z_hints |= ZFS_INHERIT_ACE;
- goto nextace;
- }
-
- if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) {
- access_mask &= ~OGE_CLEAR;
- aclp->z_ops.ace_mask_set(acep, access_mask);
- goto nextace;
- } else {
- reuse_deny = B_TRUE;
- if (type == ALLOW) {
-
- /*
- * Check preceding ACE if any, to see
- * if we need to prepend a DENY ACE.
- * This is only applicable when the acl_mode
- * property == groupmask.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
- reuse_deny = zfs_reuse_deny(aclp, acep,
- prevacep);
-
- if (!reuse_deny) {
- prevacep =
- zfs_acl_prepend_deny(zp,
- aclp, acep, mode);
- } else {
- zfs_acl_prepend_fixup(
- aclp, prevacep,
- acep, mode,
- zp->z_phys->zp_uid);
- }
- zfs_fixup_group_entries(aclp, acep,
- prevacep, mode);
-
- }
- }
- }
-nextace:
- prevacep = acep;
- }
-
- /*
- * Check out last six aces, if we have six.
- */
-
- if (aclp->z_acl_count >= 6) {
- if (zfs_have_canonical_six(aclp)) {
- need_canonical_six = 0;
- }
- }
-
- if (need_canonical_six) {
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
- zfs_acl_node_t *aclnode =
- zfs_acl_node_alloc(abstract_size * 6);
-
- aclnode->z_size = abstract_size * 6;
- aclnode->z_ace_count = 6;
- aclp->z_acl_bytes += aclnode->z_size;
- list_insert_tail(&aclp->z_acl, aclnode);
-
- zacep = aclnode->z_acldata;
-
- i = 0;
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- 0, DENY, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- DENY, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- ALLOW, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE);
- aclp->z_acl_count += 6;
- }
-
- zfs_acl_fixup_canonical_six(aclp, mode);
-}
-
-/*
- * Read zp's ACL in modifiable form and run the chmod algorithm for mode
- * over it.  z_lock and z_acl_lock are taken for the duration of the call.
- *
- * On success *aclp holds the adjusted ACL and 0 is returned.  When the
- * read fails its error is returned and zfs_acl_chmod() is not run.
- */
-int
-zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
-{
-	int error;
-
-	mutex_enter(&zp->z_lock);
-	mutex_enter(&zp->z_acl_lock);
-
-	*aclp = NULL;
-	if ((error = zfs_acl_node_read(zp, aclp, B_TRUE)) == 0)
-		zfs_acl_chmod(zp, mode, *aclp);
-
-	mutex_exit(&zp->z_acl_lock);
-	mutex_exit(&zp->z_lock);
-
-	return (error);
-}
-
-/*
- * strip off write_owner and write_acl
- *
- * Only acts when the file system's acl_inherit setting is
- * ZFS_ACL_RESTRICTED and the ACE is an ALLOW entry; the bits in
- * RESTRICTED_CLEAR are then removed from the ACE's mask.
- */
-static void
-zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
-{
-	uint32_t curmask = aclp->z_ops.ace_mask_get(acep);
-
-	if (zfsvfs->z_acl_inherit != ZFS_ACL_RESTRICTED)
-		return;
-	if (aclp->z_ops.ace_type_get(acep) != ALLOW)
-		return;
-
-	curmask &= ~RESTRICTED_CLEAR;
-	aclp->z_ops.ace_mask_set(acep, curmask);
-}
-
-/*
- * Should ACE be inherited?
- *
- * Looks at the low four bits of acep_flags.  A directory inherits any
- * ACE marked ACE_DIRECTORY_INHERIT_ACE.  An ACE marked
- * ACE_FILE_INHERIT_ACE is inherited as well, except by a directory when
- * ACE_NO_PROPAGATE_INHERIT_ACE is also set.  Returns non-zero when the
- * ACE applies to zp, 0 otherwise.
- */
-static int
-zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
-{
-	int is_dir = (ZTOV(zp)->v_type == VDIR);
-	int inherit_bits = (acep_flags & 0xf);
-
-	if (is_dir && (inherit_bits & ACE_DIRECTORY_INHERIT_ACE))
-		return (1);
-
-	if (!(inherit_bits & ACE_FILE_INHERIT_ACE))
-		return (0);
-
-	return (!(is_dir && (inherit_bits & ACE_NO_PROPAGATE_INHERIT_ACE)));
-}
-
-/*
- * inherit inheritable ACEs from parent
- *
- * Builds and returns a new ACL for zp from the ACEs of the parent ACL
- * paclp that zfs_ace_can_use() accepts.  Nothing is inherited when the
- * acl_inherit setting is ZFS_ACL_DISCARD, and ALLOW entries are skipped
- * under ZFS_ACL_NOALLOW.  Every inherited ACE gets its own node and is
- * flagged ACE_INHERITED_ACE.
- *
- * For non-directories, and for ACEs flagged
- * ACE_NO_PROPAGATE_INHERIT_ACE, the inheritance flags are stripped from
- * the copy.  For directories the copy is otherwise marked inherit-only
- * and, unless the ACE is file-inherit only, followed by a second,
- * effective copy without inheritance flags.  Effective copies go through
- * zfs_restricted_update().
- *
- * *need_chmod is set to B_TRUE up front and cleared only under
- * ZFS_ACL_PASSTHROUGH when an owner@, group@ or everyone@ ACE is
- * inherited by a regular file, or by a directory via
- * ACE_DIRECTORY_INHERIT_ACE.
- */
-static zfs_acl_t *
-zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, boolean_t *need_chmod)
-{
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	void *pacep;
-	void *acep, *acep2;
-	zfs_acl_node_t *aclnode, *aclnode2;
-	zfs_acl_t *aclp = NULL;
-	uint64_t who;
-	uint32_t access_mask;
-	uint16_t iflags, newflags, type;
-	size_t ace_size;
-	void *data1, *data2;
-	size_t data1sz, data2sz;
-	enum vtype vntype = ZTOV(zp)->v_type;
-
-	*need_chmod = B_TRUE;
-	pacep = NULL;
-	aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
-	if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
-		while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
-		    &access_mask, &iflags, &type)) {
-
-			if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
-			    type == ALLOW)
-				continue;
-
-			ace_size = aclp->z_ops.ace_size(pacep);
-
-			if (!zfs_ace_can_use(zp, iflags))
-				continue;
-
-			/*
-			 * If owner@, group@, or everyone@ inheritable
-			 * then zfs_acl_chmod() isn't needed.
-			 */
-			if (zfsvfs->z_acl_inherit ==
-			    ZFS_ACL_PASSTHROUGH &&
-			    ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
-			    ((iflags & OWNING_GROUP) ==
-			    OWNING_GROUP)) && (vntype == VREG ||
-			    (vntype == VDIR &&
-			    (iflags & ACE_DIRECTORY_INHERIT_ACE))))
-				*need_chmod = B_FALSE;
-
-			aclnode = zfs_acl_node_alloc(ace_size);
-			list_insert_tail(&aclp->z_acl, aclnode);
-			acep = aclnode->z_acldata;
-			zfs_set_ace(aclp, acep, access_mask, type,
-			    who, iflags|ACE_INHERITED_ACE);
-
-			/*
-			 * Copy special opaque data if any
-			 */
-			if ((data1sz = paclp->z_ops.ace_data(pacep,
-			    &data1)) != 0) {
-				VERIFY((data2sz = aclp->z_ops.ace_data(acep,
-				    &data2)) == data1sz);
-				bcopy(data1, data2, data2sz);
-			}
-			aclp->z_acl_count++;
-			aclnode->z_ace_count++;
-			aclp->z_acl_bytes += aclnode->z_size;
-			newflags = aclp->z_ops.ace_flags_get(acep);
-
-			if (vntype == VDIR)
-				aclp->z_hints |= ZFS_INHERIT_ACE;
-
-			if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) ||
-			    (vntype != VDIR)) {
-				newflags &= ~ALL_INHERIT;
-				aclp->z_ops.ace_flags_set(acep,
-				    newflags|ACE_INHERITED_ACE);
-				zfs_restricted_update(zfsvfs, aclp, acep);
-				continue;
-			}
-
-			ASSERT(vntype == VDIR);
-
-			newflags = aclp->z_ops.ace_flags_get(acep);
-			if ((iflags & (ACE_FILE_INHERIT_ACE |
-			    ACE_DIRECTORY_INHERIT_ACE)) !=
-			    ACE_FILE_INHERIT_ACE) {
-				aclnode2 = zfs_acl_node_alloc(ace_size);
-				list_insert_tail(&aclp->z_acl, aclnode2);
-				acep2 = aclnode2->z_acldata;
-				zfs_set_ace(aclp, acep2,
-				    access_mask, type, who,
-				    iflags|ACE_INHERITED_ACE);
-				newflags |= ACE_INHERIT_ONLY_ACE;
-				aclp->z_ops.ace_flags_set(acep, newflags);
-				newflags &= ~ALL_INHERIT;
-				aclp->z_ops.ace_flags_set(acep2,
-				    newflags|ACE_INHERITED_ACE);
-
-				/*
-				 * Copy special opaque data if any
-				 */
-				if ((data1sz = aclp->z_ops.ace_data(acep,
-				    &data1)) != 0) {
-					VERIFY((data2sz =
-					    aclp->z_ops.ace_data(acep2,
-					    &data2)) == data1sz);
-					bcopy(data1, data2, data1sz);
-				}
-				aclp->z_acl_count++;
-				aclnode2->z_ace_count++;
-				/* account for the node that was just added */
-				aclp->z_acl_bytes += aclnode2->z_size;
-				zfs_restricted_update(zfsvfs, aclp, acep2);
-			} else {
-				newflags |= ACE_INHERIT_ONLY_ACE;
-				aclp->z_ops.ace_flags_set(acep,
-				    newflags|ACE_INHERITED_ACE);
-			}
-		}
-	}
-	return (aclp);
-}
-
-/*
- * Create file system object initial permissions
- * including inheritable ACEs.
- *
- * Steps, in order:
- *  1. Derive the initial mode from vap (MAKEIMODE).
- *  2. Pick owner and group.  For root-node/replay creations and xattr
- *     directories the ids in vap are used as given.  Otherwise the owner
- *     comes from cr; the group comes from vap when AT_GID is set and the
- *     checks below accept it, else from the parent when the parent is
- *     set-GID, else from cr.
- *  3. Propagate or clear S_ISGID as described further down.
- *  4. Store uid, gid and mode in the znode.
- *  5. When the caller supplied no ACL (setaclp == NULL), build one: inherit
- *     from the parent if it is flagged ZFS_INHERIT_ACE, else start empty;
- *     then run zfs_acl_chmod() unless inheritance reported it unnecessary.
- *  6. Force ZFS_ACL_AUTO_INHERIT on directories, store the ACL through
- *     zfs_aclset_common(), and apply optional attributes when vap carries
- *     AT_XVATTR.
- *
- * zp's z_lock and z_acl_lock are taken and released here.  A failure of
- * zfs_aclset_common() is only caught by the ASSERT3U below.  An ACL built
- * here is freed before returning; a caller-supplied setaclp is left alone.
- */
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
- vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
-{
- uint64_t mode, fuid, fgid;
- int error;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zfs_acl_t *aclp = NULL;
- zfs_acl_t *paclp;
- xvattr_t *xvap = (xvattr_t *)vap;
- gid_t gid;
- boolean_t need_chmod = B_TRUE;
-
- if (setaclp)
- aclp = setaclp;
-
- mode = MAKEIMODE(vap->va_type, vap->va_mode);
-
- /*
- * Determine uid and gid.
- */
- if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
- ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
- fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr,
- ZFS_OWNER, tx, fuidp);
- fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
- ZFS_GROUP, tx, fuidp);
- gid = vap->va_gid;
- } else {
- fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp);
- fgid = 0;
- if (vap->va_mask & AT_GID) {
- fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
- ZFS_GROUP, tx, fuidp);
- gid = vap->va_gid;
- if (fgid != parent->z_phys->zp_gid &&
- !groupmember(vap->va_gid, cr) &&
- secpolicy_vnode_create_gid(cr) != 0)
- fgid = 0;
- }
- if (fgid == 0) {
- if (parent->z_phys->zp_mode & S_ISGID) {
- fgid = parent->z_phys->zp_gid;
- gid = zfs_fuid_map_id(zfsvfs, fgid,
- cr, ZFS_GROUP);
- } else {
- fgid = zfs_fuid_create_cred(zfsvfs,
- ZFS_GROUP, tx, cr, fuidp);
- gid = crgetgid(cr);
- }
- }
- }
-
- /*
- * If we're creating a directory, and the parent directory has the
- * set-GID bit set, set it on the new directory.
- * Otherwise, if the user is neither privileged nor a member of the
- * file's new group, clear the file's set-GID bit.
- */
-
- if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) {
- mode |= S_ISGID;
- } else {
- if ((mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(cr, gid) != 0)
- mode &= ~S_ISGID;
- }
-
- zp->z_phys->zp_uid = fuid;
- zp->z_phys->zp_gid = fgid;
- zp->z_phys->zp_mode = mode;
-
- if (aclp == NULL) {
- mutex_enter(&parent->z_lock);
- if (parent->z_phys->zp_flags & ZFS_INHERIT_ACE) {
- mutex_enter(&parent->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE));
- mutex_exit(&parent->z_acl_lock);
- aclp = zfs_acl_inherit(zp, paclp, &need_chmod);
- zfs_acl_free(paclp);
- } else {
- aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
- }
- mutex_exit(&parent->z_lock);
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
- if (need_chmod)
- zfs_acl_chmod(zp, mode, aclp);
- } else {
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
- }
-
- /* Force auto_inherit on all new directory objects */
- if (vap->va_type == VDIR)
- aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
-
- error = zfs_aclset_common(zp, aclp, cr, fuidp, tx);
-
- /* Set optional attributes if any */
- if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(zp, xvap);
-
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
- ASSERT3U(error, ==, 0);
-
- if (aclp != setaclp)
- zfs_acl_free(aclp);
-}
-
-/*
- * Retrieve a file's ACL
- *
- * vsecp->vsa_mask selects what is returned:
- *   VSA_ACECNT        - vsa_aclcnt receives the number of ACEs;
- *   VSA_ACE           - vsa_aclentp receives a kmem_alloc()ed copy of the
- *                       ACEs and vsa_aclentsz its size in bytes;
- *   VSA_ACE_ACLFLAGS  - vsa_aclflags receives the ACL_DEFAULTED,
- *                       ACL_PROTECTED and ACL_AUTO_INHERIT bits derived
- *                       from zp_flags;
- *   VSA_ACE_ALLTYPES  - object-type ACEs are included rather than
- *                       filtered out.
- * When the znode is flagged ZFS_ACL_OBJ_ACE and VSA_ACE_ALLTYPES was not
- * requested, the ACL is scanned so that object-type ACEs are counted
- * separately (largeace) from ordinary ones (count).
- *
- * The caller needs ACE_READ_ACL access as checked by zfs_zaccess();
- * skipaclchk and cr are passed to that check.  Note that the access
- * check runs before the empty-mask check.
- *
- * Returns 0 on success, the zfs_zaccess() or zfs_acl_node_read() error,
- * or ENOSYS when none of the four mask bits above is set.
- * NOTE(review): the buffer placed in vsa_aclentp is not freed here;
- * presumably the caller releases it using vsa_aclentsz -- confirm.
- */
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
- zfs_acl_t *aclp;
- ulong_t mask;
- int error;
- int count = 0;
- int largeace = 0;
-
- mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
- VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
-
- if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
- return (error);
-
- if (mask == 0)
- return (ENOSYS);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- /*
- * Scan ACL to determine number of ACEs
- */
- if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
- !(mask & VSA_ACE_ALLTYPES)) {
- void *zacep = NULL;
- uint64_t who;
- uint32_t access_mask;
- uint16_t type, iflags;
-
- while (zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type)) {
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- largeace++;
- continue;
- default:
- count++;
- }
- }
- vsecp->vsa_aclcnt = count; /* stored here even without VSA_ACECNT */
- } else
- count = aclp->z_acl_count;
-
- if (mask & VSA_ACECNT) {
- vsecp->vsa_aclcnt = count;
- }
-
- if (mask & VSA_ACE) {
- size_t aclsz;
-
- zfs_acl_node_t *aclnode = list_head(&aclp->z_acl);
-
- aclsz = count * sizeof (ace_t) +
- sizeof (ace_object_t) * largeace;
-
- vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
- vsecp->vsa_aclentsz = aclsz;
-
- if (aclp->z_version == ZFS_ACL_VERSION_FUID)
- zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
- vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
- else {
- bcopy(aclnode->z_acldata, vsecp->vsa_aclentp,
- count * sizeof (ace_t));
- }
- }
- if (mask & VSA_ACE_ACLFLAGS) {
- vsecp->vsa_aclflags = 0;
- if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
- vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
- vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
- vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- zfs_acl_free(aclp);
-
- return (0);
-}
-
-/*
- * Build an in-core zfs_acl_t from the ACEs supplied in vsecp.
- *
- * The ACL version follows the file system version.  The caller's ACEs are
- * converted into a single node, in the old ACE layout for
- * ZFS_ACL_VERSION_INITIAL and in the FUID layout otherwise.  When
- * vsecp requests VSA_ACE_ACLFLAGS, the protected/defaulted/auto-inherit
- * flags are carried over into z_hints.
- *
- * Returns 0 and stores the new ACL in *zaclp on success.  Returns EINVAL
- * when vsa_aclcnt is not within 1..MAX_ACL_ENTRIES, or the conversion
- * error, in which case nothing is stored in *zaclp.
- */
-int
-zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
-    vsecattr_t *vsecp, zfs_acl_t **zaclp)
-{
-	zfs_acl_t *newacl;
-	zfs_acl_node_t *node;
-	int nentries = vsecp->vsa_aclcnt;
-	int error;
-
-	if (vsecp->vsa_aclcnt <= 0 || vsecp->vsa_aclcnt > MAX_ACL_ENTRIES)
-		return (EINVAL);
-
-	newacl = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
-	newacl->z_hints = 0;
-
-	/* sized for the largest ACE layout */
-	node = zfs_acl_node_alloc(nentries * sizeof (zfs_object_ace_t));
-
-	if (newacl->z_version == ZFS_ACL_VERSION_INITIAL) {
-		error = zfs_copy_ace_2_oldace(obj_type, newacl,
-		    (ace_t *)vsecp->vsa_aclentp, node->z_acldata,
-		    nentries, &node->z_size);
-	} else {
-		error = zfs_copy_ace_2_fuid(obj_type, newacl,
-		    vsecp->vsa_aclentp, node->z_acldata, nentries,
-		    &node->z_size);
-	}
-	if (error != 0) {
-		/* node is not on the ACL's list yet; free both */
-		zfs_acl_free(newacl);
-		zfs_acl_node_free(node);
-		return (error);
-	}
-
-	node->z_ace_count = nentries;
-	newacl->z_acl_count = nentries;
-	newacl->z_acl_bytes = node->z_size;
-	list_insert_head(&newacl->z_acl, node);
-
-	/*
-	 * If flags are being set then add them to z_hints
-	 */
-	if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
-		if (vsecp->vsa_aclflags & ACL_PROTECTED)
-			newacl->z_hints |= ZFS_ACL_PROTECTED;
-		if (vsecp->vsa_aclflags & ACL_DEFAULTED)
-			newacl->z_hints |= ZFS_ACL_DEFAULTED;
-		if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
-			newacl->z_hints |= ZFS_ACL_AUTO_INHERIT;
-	}
-
-	*zaclp = newacl;
-
-	return (0);
-}
-
-/*
- * Set a file's ACL.
- *
- *	zp		- znode whose ACL is replaced
- *	vsecp		- new ACL; vsa_mask must include VSA_ACE or VSA_ACECNT
- *	skipaclchk	- passed through to zfs_zaccess()
- *	cr		- credentials of the caller
- *
- * Returns ENOSYS if vsa_mask selects neither VSA_ACE nor VSA_ACECNT,
- * EPERM if the file is flagged ZFS_IMMUTABLE, or the error reported by
- * zfs_zaccess(), zfs_vsec_2_aclp() or dmu_tx_assign().  When the
- * assignment fails with ERESTART and the file system assigns with
- * TXG_NOWAIT, the transaction is waited on, aborted, and the operation
- * is retried from the access check.
- *
- * z_lock and z_acl_lock are held from before the transaction is created
- * until after it has been committed (or until the assignment fails).
- */
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
-	ulong_t		mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
-	dmu_tx_t	*tx;
-	int		error;
-	zfs_acl_t	*aclp;
-	zfs_fuid_info_t	*fuidp = NULL;
-
-	if (mask == 0)
-		return (ENOSYS);
-
-	if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
-		return (EPERM);
-
-	/* Check access before allocating anything. */
-	if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) != 0)
-		return (error);
-
-	error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp);
-	if (error)
-		return (error);
-
-	/*
-	 * If ACL wide flags aren't being set then preserve any
-	 * existing flags.
-	 */
-	if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
-		aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
-	}
-top:
-	/* Re-checked on every retry; aclp is owned here and must be freed. */
-	if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) != 0) {
-		zfs_acl_free(aclp);
-		return (error);
-	}
-
-	mutex_enter(&zp->z_lock);
-	mutex_enter(&zp->z_acl_lock);
-
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-
-	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
-		/* Are we upgrading ACL? */
-		if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-		    zp->z_phys->zp_acl.z_acl_version ==
-		    ZFS_ACL_VERSION_INITIAL) {
-			dmu_tx_hold_free(tx,
-			    zp->z_phys->zp_acl.z_acl_extern_obj,
-			    0, DMU_OBJECT_END);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-			    0, aclp->z_acl_bytes);
-		} else {
-			dmu_tx_hold_write(tx,
-			    zp->z_phys->zp_acl.z_acl_extern_obj,
-			    0, aclp->z_acl_bytes);
-		}
-	} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
-	}
-	if (aclp->z_has_fuids) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
-
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (error) {
-		mutex_exit(&zp->z_acl_lock);
-		mutex_exit(&zp->z_lock);
-
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		zfs_acl_free(aclp);
-		return (error);
-	}
-
-	error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
-	ASSERT(error == 0);
-
-	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
-
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
-	zfs_acl_free(aclp);
-	dmu_tx_commit(tx);
-
-	mutex_exit(&zp->z_acl_lock);
-	mutex_exit(&zp->z_lock);
-
-	return (error);
-}
-
-/*
- * Evaluate the requested NFSv4 access mask 'v4_mode' against the ACL of
- * 'zp' for the credentials 'cr'.
- *
- * working_mode returns the permissions that were not granted
- * (including any bits that were matched by a DENY entry).
- *
- * check_privs is set to B_TRUE once a non-empty request is seen and is
- * reset to B_FALSE for the failures detected before the ACL is read
- * (EROFS, EPERM, EACCES on a quarantined file).
- *
- * An empty request (v4_mode == 0) returns 0 without writing either
- * output parameter.
- *
- * Returns 0 when every requested bit was granted, EACCES when some bits
- * remain, EIO for an ACE with an unrecognized entry type, or the error
- * from zfs_acl_node_read().
- */
-static int
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
- boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
-{
- zfs_acl_t *aclp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- uid_t uid = crgetuid(cr);
- uint64_t who;
- uint16_t type, iflags;
- uint16_t entry_type;
- uint32_t access_mask;
- uint32_t deny_mask = 0;
- zfs_ace_hdr_t *acep = NULL;
- boolean_t checkit;
- uid_t fowner;
- uid_t gowner;
-
- /*
- * Short circuit empty requests
- */
- if (v4_mode == 0)
- return (0);
-
- *check_privs = B_TRUE;
-
- if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
- *working_mode = 0;
- return (0);
- }
-
- *working_mode = v4_mode;
-
- if ((v4_mode & WRITE_MASK) &&
- (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
- (!IS_DEVVP(ZTOV(zp)))) {
- *check_privs = B_FALSE;
- return (EROFS);
- }
-
- /*
- * Only check for READONLY on non-directories.
- * IMMUTABLE is checked for directories and non-directories alike.
- */
- if ((v4_mode & WRITE_MASK_DATA) &&
- (((ZTOV(zp)->v_type != VDIR) &&
- (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
- (ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
- *check_privs = B_FALSE;
- return (EPERM);
- }
-
- if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
- *check_privs = B_FALSE;
- return (EPERM);
- }
-
- if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
- *check_privs = B_FALSE;
- return (EACCES);
- }
-
- /*
- * The caller requested that the ACL check be skipped. This
- * would only happen if the caller checked VOP_ACCESS() with a
- * 32 bit ACE mask and already had the appropriate permissions.
- */
- if (skipaclchk) {
- *working_mode = 0;
- return (0);
- }
-
- zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- /*
- * Walk the ACEs in order; each ACE that applies to the caller
- * removes the bits it mentions from *working_mode.
- */
- while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type)) {
-
- if (iflags & ACE_INHERIT_ONLY_ACE)
- continue;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
-
- checkit = B_FALSE;
-
- switch (entry_type) {
- case ACE_OWNER:
- if (uid == fowner)
- checkit = B_TRUE;
- break;
- case OWNING_GROUP:
- who = gowner;
- /*FALLTHROUGH*/
- case ACE_IDENTIFIER_GROUP:
- checkit = zfs_groupmember(zfsvfs, who, cr);
- break;
- case ACE_EVERYONE:
- checkit = B_TRUE;
- break;
-
- /* USER Entry */
- default:
- if (entry_type == 0) {
- uid_t newid;
-
- newid = zfs_fuid_map_id(zfsvfs, who, cr,
- ZFS_ACE_USER);
- if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
- uid == newid)
- checkit = B_TRUE;
- break;
- } else {
- /* Unknown entry type: drop the ACL and lock, fail. */
- zfs_acl_free(aclp);
- mutex_exit(&zp->z_acl_lock);
- return (EIO);
- }
- }
-
- if (checkit) {
- uint32_t mask_matched = (access_mask & *working_mode);
-
- if (mask_matched) {
- /* Remember denied bits; they are restored below. */
- if (type == DENY)
- deny_mask |= mask_matched;
-
- *working_mode &= ~mask_matched;
- }
- }
-
- /* Are we done? */
- if (*working_mode == 0)
- break;
- }
-
- mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
-
- /* Put the found 'denies' back on the working mode */
- *working_mode |= deny_mask;
-
- if (*working_mode)
- return (EACCES);
-
- return (0);
-}
-
-/*
- * Retry a failed access check as an append.  This is only attempted when
- * the single permission still missing is ACE_WRITE_DATA; in that case the
- * ACL is re-evaluated for ACE_APPEND_DATA.  Any other outstanding mode
- * yields EACCES.
- */
-static int
-zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
-    cred_t *cr)
-{
-	if (*working_mode == ACE_WRITE_DATA) {
-		return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
-		    check_privs, B_FALSE, cr));
-	}
-
-	return (EACCES);
-}
-
-/*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsystem when a deny is determined.
- *
- * 'mode' is an NFSv4 access mask.  When 'zp' is an extended attribute
- * directory (ZFS_XATTR set and vnode type VDIR) the check is made
- * against the parent znode instead, with the data read/write/execute
- * bits remapped to the named-attribute bits.
- *
- * Returns 0 if access is granted, otherwise the error from
- * zfs_zget(), zfs_zaccess_common() or the secpolicy_*() routines, or
- * EACCES if unchecked bits remain in the working mode.
- */
-int
-zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
-{
- uint32_t working_mode;
- int error;
- int is_attr;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- boolean_t check_privs;
- znode_t *xzp;
- znode_t *check_zp = zp;
-
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
-
- /*
- * If attribute then validate against base file
- */
- if (is_attr) {
- if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
- return (error);
- }
-
- /* xzp is released with VN_RELE() on every return path below. */
- check_zp = xzp;
-
- /*
- * fixup mode to map to xattr perms
- */
-
- if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
- mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mode |= ACE_WRITE_NAMED_ATTRS;
- }
-
- if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
- mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
- mode |= ACE_READ_NAMED_ATTRS;
- }
- }
-
- if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
- &check_privs, skipaclchk, cr)) == 0) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (0);
- }
-
- /* Failures flagged !check_privs are final; no privilege check. */
- if (error && !check_privs) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (error);
- }
-
- /*
- * NOTE(review): the append retry is made against zp, not check_zp;
- * confirm this is intended when zp is an xattr directory.
- */
- if (error && (flags & V_APPEND)) {
- error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
- }
-
- if (error && check_privs) {
- uid_t owner;
- mode_t checkmode = 0;
-
- owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
- ZFS_OWNER);
-
- /*
- * First check for implicit owner permission on
- * read_acl/read_attributes
- */
-
- error = 0;
- ASSERT(working_mode != 0);
-
- if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
- owner == crgetuid(cr)))
- working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
-
- /* Translate the bits still missing into VREAD/VWRITE/VEXEC. */
- if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
- ACE_READ_ACL|ACE_READ_ATTRIBUTES))
- checkmode |= VREAD;
- if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
- ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES))
- checkmode |= VWRITE;
- if (working_mode & ACE_EXECUTE)
- checkmode |= VEXEC;
-
- if (checkmode)
- error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- owner, checkmode);
-
- if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_create_gid(cr);
- if (error == 0 && (working_mode & ACE_WRITE_ACL))
- error = secpolicy_vnode_setdac(cr, owner);
-
- if (error == 0 && (working_mode &
- (ACE_DELETE|ACE_DELETE_CHILD)))
- error = secpolicy_vnode_remove(cr);
-
- if (error == 0 && (working_mode & ACE_SYNCHRONIZE))
- error = secpolicy_vnode_owner(cr, owner);
-
- if (error == 0) {
- /*
- * See if any bits other than those already checked
- * for are still present. If so then return EACCES
- */
- if (working_mode & ~(ZFS_CHECKED_MASKS)) {
- error = EACCES;
- }
- }
- }
-
- if (is_attr)
- VN_RELE(ZTOV(xzp));
-
- return (error);
-}
-
-/*
- * Access check for a traditional unix VREAD/VWRITE/VEXEC request.
- * The mode is shifted right by 6 bits, converted with zfs_unix_to_v4()
- * and handed to zfs_zaccess() together with the caller's flags.
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
-{
-	int v4_mode = zfs_unix_to_v4(mode >> 6);
-
-	return (zfs_zaccess(zp, v4_mode, flags, B_FALSE, cr));
-}
-
-/*
- * Access callback used by secpolicy_vnode_setattr.  Same translation as
- * zfs_zaccess_rwx(), but always called with no flags.
- */
-int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
-{
-	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), 0, B_FALSE, cr));
-}
-
-/*
- * Last step of the delete permission check: ask the security policy
- * whether the caller may be granted 'missing_perms' on the directory
- * 'dzp' and, if so, apply the sticky-bit removal check for 'zp'.
- * Returns the first non-zero error, or 0.
- */
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp,
-    mode_t missing_perms, cred_t *cr)
-{
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uid_t		dir_owner;
-	int		err;
-
-	dir_owner = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr,
-	    ZFS_OWNER);
-
-	err = secpolicy_vnode_access(cr, ZTOV(dzp), dir_owner, missing_perms);
-	if (err != 0)
-		return (err);
-
-	return (zfs_sticky_remove_access(dzp, zp, cr));
-}
-
-/*
- * Determine whether deleting 'zp' from directory 'dzp' should be
- * granted or denied.  The ACLs of both objects are evaluated with
- * zfs_zaccess_common(); the least priv subsystem is only consulted
- * through zfs_delete_final_check().
- *
- * Returns 0 if the delete is permitted, EPERM if the target is flagged
- * ZFS_IMMUTABLE or ZFS_NOUNLINK, or the error produced by the checks
- * below.
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * | Parent Dir | Target Object Permissions |
- * | permissions | |
- * -------------------------------------------------------
- * | | ACL Allows | ACL Denies| Delete |
- * | | Delete | Delete | unspecified|
- * -------------------------------------------------------
- * | ACL Allows | Permit | Permit | Permit |
- * | DELETE_CHILD | |
- * -------------------------------------------------------
- * | ACL Denies | Permit | Deny | Deny |
- * | DELETE_CHILD | | | |
- * -------------------------------------------------------
- * | ACL specifies | | | |
- * | only allow | Permit | Permit | Permit |
- * | write and | | | |
- * | execute | | | |
- * -------------------------------------------------------
- * | ACL denies | | | |
- * | write and | Permit | Deny | Deny |
- * | execute | | | |
- * -------------------------------------------------------
- * ^
- * |
- * No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
- uint32_t dzp_working_mode = 0;
- uint32_t zp_working_mode = 0;
- int dzp_error, zp_error;
- mode_t missing_perms;
- boolean_t dzpcheck_privs = B_TRUE;
- boolean_t zpcheck_privs = B_TRUE;
-
- /*
- * We want specific DELETE permissions to
- * take precedence over WRITE/EXECUTE. We don't
- * want an ACL such as this to mess us up.
- * user:joe:write_data:deny,user:joe:delete:allow
- *
- * However, deny permissions may ultimately be overridden
- * by secpolicy_vnode_access().
- *
- * We will ask for all of the necessary permissions and then
- * look at the working modes from the directory and target object
- * to determine what was found.
- */
-
- if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
- return (EPERM);
-
- /*
- * If the directory permissions allow the delete, we are done.
- */
- if ((dzp_error = zfs_zaccess_common(dzp,
- ACE_DELETE_CHILD|ACE_EXECUTE|ACE_WRITE_DATA,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- /*
- * If target object has delete permission then we are done
- */
- if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
- &zpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- /*
- * A check that cleared its check_privs flag failed for a reason
- * that privileges cannot override; report that error as is.
- */
- if (!dzpcheck_privs)
- return (dzp_error);
- else if (!zpcheck_privs)
- return (zp_error);
-
- /*
- * First check the first row.
- * We only need to see if parent Allows delete_child
- */
- if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
- return (0);
-
- /*
- * Second row
- * we already have the necessary information in
- * zp_working_mode, zp_error and dzp_error.
- */
-
- if ((zp_working_mode & ACE_DELETE) == 0)
- return (0);
-
- /*
- * determine the needed permissions based off of the directories
- * working mode
- */
-
- missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
- missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
-
- if (dzp_error == EACCES)
- return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
-
- /*
- * Third Row
- * only need to see if we have write/execute on directory.
- */
-
- if (missing_perms == 0)
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- /*
- * Fourth Row
- *
- * NOTE(review): the "Second row" test above already returned when
- * ACE_DELETE was absent from zp_working_mode, so this condition
- * appears to be always false here; confirm before relying on it.
- */
-
- if (missing_perms && ((zp_working_mode & ACE_DELETE) == 0))
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
-}
-
-/*
- * Permission check for renaming 'szp' from directory 'sdzp' into
- * directory 'tdzp', optionally replacing an existing target 'tzp'.
- *
- * Rename permission is the combination of delete permission on the
- * source (and on the target, if one exists) plus add-file or
- * add-subdirectory permission on the target directory.  A quarantined
- * source is refused with EACCES.
- */
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
-    znode_t *tzp, cred_t *cr)
-{
-	int add_perm;
-	int error;
-
-	if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
-		return (EACCES);
-
-	if (ZTOV(szp)->v_type == VDIR)
-		add_perm = ACE_ADD_SUBDIRECTORY;
-	else
-		add_perm = ACE_ADD_FILE;
-
-	/*
-	 * The delete portion comes first: the source must be removable
-	 * from its directory.
-	 */
-	error = zfs_zaccess_delete(sdzp, szp, cr);
-	if (error)
-		return (error);
-
-	/*
-	 * An existing target must be removable as well.
-	 */
-	if (tzp) {
-		error = zfs_zaccess_delete(tdzp, tzp, cr);
-		if (error)
-			return (error);
-	}
-
-	/*
-	 * Finally the add permission on the target directory.
-	 */
-	return (zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr));
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_ctldir.c b/zfs/lib/libdmu-ctl/zfs_ctldir.c
deleted file mode 100644
index 45de481c9..000000000
--- a/zfs/lib/libdmu-ctl/zfs_ctldir.c
+++ /dev/null
@@ -1,1147 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_ctldir.c 1.20 08/04/27 SMI"
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future. The elements are built using the GFS primitives, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- * ctldir ------> snapshotdir -------> snapshot
- * |
- * |
- * V
- * mounted fs
- *
- * The 'snapshot' node contains just enough information to lookup '..' and act
- * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
- * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
- *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land. The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
- *
- * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
- * share the same vfs_t as the head filesystem (what '.zfs' lives under).
- *
- * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
- * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
- * However, vnodes within these mounted on file systems have their v_vfsp
- * fields set to the head filesystem to make NFS happy (see
- * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
- * so that it cannot be freed until all snapshots have been unmounted.
- */
-
-#include <fs/fs_subr.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/vfs_opreg.h>
-#include <sys/gfs.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/dsl_deleg.h>
-#include <sys/mount.h>
-#include <sys/sunddi.h>
-
-typedef struct zfsctl_node {
- gfs_dir_t zc_gfs_private;
- uint64_t zc_id; /* node id; reported as the object in zfsctl_common_fid() */
- timestruc_t zc_cmtime; /* ctime and mtime, always the same */
-} zfsctl_node_t;
-
-typedef struct zfsctl_snapdir {
- zfsctl_node_t sd_node;
- kmutex_t sd_lock; /* held while sd_snaps is searched or modified */
- avl_tree_t sd_snaps; /* zfs_snapentry_t entries, ordered by se_name */
-} zfsctl_snapdir_t;
-
-typedef struct {
- char *se_name; /* snapshot name; kmem allocation of strlen + 1 bytes */
- vnode_t *se_root; /* vnode the snapshot file system is mounted on */
- avl_node_t se_node; /* linkage in zfsctl_snapdir_t.sd_snaps */
-} zfs_snapentry_t;
-
-/*
- * AVL comparison routine for zfs_snapentry_t nodes.  Entries are ordered
- * by strcmp() of their names; the result is normalized to -1, 0 or 1.
- */
-static int
-snapentry_compare(const void *a, const void *b)
-{
-	const zfs_snapentry_t *lhs = a;
-	const zfs_snapentry_t *rhs = b;
-	int cmp = strcmp(lhs->se_name, rhs->se_name);
-
-	if (cmp == 0)
-		return (0);
-
-	return (cmp < 0 ? -1 : 1);
-}
-
-/*
- * Vnode operation vectors for the three kinds of .zfs nodes.  They are
- * filled in by gfs_make_opsvec() in zfsctl_init() and released in
- * zfsctl_fini().
- */
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-
-/* Operation templates; defined further down in this file. */
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
-
-/* Each entry: name, operation template, resulting vnodeops pointer. */
-static gfs_opsvec_t zfsctl_opsvec[] = {
- { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
- { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
- { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
- { NULL }
-};
-
-/*
- * Root directory elements. We have only a single static entry, 'snapshot'.
- * The array is terminated by a NULL entry.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
- { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
- { NULL }
-};
-
-/*
- * include . and .. in the calculation
- *
- * The array holds one real entry plus the NULL terminator, so the
- * element count plus one yields 3: ".", ".." and "snapshot".
- */
-#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
- sizeof (gfs_dirent_t)) + 1)
-
-
-/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories. This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
- *
- * gfs_make_opsvec() is expected to return 0; its result is checked with
- * VERIFY.
- */
-void
-zfsctl_init(void)
-{
- VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
-}
-
-/*
- * Tear down what zfsctl_init() set up: free each vnode ops vector that
- * was created and reset its pointer to NULL.
- */
-void
-zfsctl_fini(void)
-{
-	if (zfsctl_ops_root) {
-		vn_freevnodeops(zfsctl_ops_root);
-		zfsctl_ops_root = NULL;
-	}
-
-	if (zfsctl_ops_snapdir) {
-		vn_freevnodeops(zfsctl_ops_snapdir);
-		zfsctl_ops_snapdir = NULL;
-	}
-
-	if (zfsctl_ops_snapshot) {
-		vn_freevnodeops(zfsctl_ops_snapshot);
-		zfsctl_ops_snapshot = NULL;
-	}
-}
-
-/*
- * Return the inode number associated with the 'snapshot' directory.
- * This is the inode callback handed to gfs_root_create() by
- * zfsctl_create(); 'snapshot' is the only static entry, so the index
- * is asserted to be 0.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
-{
- ASSERT(index == 0);
- return (ZFSCTL_INO_SNAPDIR);
-}
-
-/*
- * Create the '.zfs' directory. This directory is cached as part of the VFS
- * structure. This results in a hold on the vfs_t. The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1. This reference
- * is removed when the ctldir is destroyed in the unmount.
- *
- * The new vnode is stored in zfsvfs->z_ctldir, which must be NULL on
- * entry.
- */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
-{
- vnode_t *vp, *rvp;
- zfsctl_node_t *zcp;
-
- ASSERT(zfsvfs->z_ctldir == NULL);
-
- vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
- zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
- zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = ZFSCTL_INO_ROOT;
-
- /* .zfs takes its ctime/mtime from the root znode's creation time. */
- VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
- VN_RELE(rvp);
-
- /*
- * We're only faking the fact that we have a root of a filesystem for
- * the sake of the GFS interfaces. Undo the flag manipulation it did
- * for us.
- */
- vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
-
- zfsvfs->z_ctldir = vp;
-}
-
-/*
- * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
- *
- * Drops the hold kept in zfsvfs->z_ctldir and clears the pointer.
- */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
-{
- VN_RELE(zfsvfs->z_ctldir);
- zfsvfs->z_ctldir = NULL;
-}
-
-/*
- * Return the .zfs directory vnode that belongs to the file system of
- * the given root znode.  The vnode is returned held; the caller is
- * responsible for releasing it.
- */
-vnode_t *
-zfsctl_root(znode_t *zp)
-{
-	vnode_t *ctldir;
-
-	ASSERT(zfs_has_ctldir(zp));
-
-	ctldir = zp->z_zfsvfs->z_ctldir;
-	VN_HOLD(ctldir);
-
-	return (ctldir);
-}
-
-/*
- * Open routine shared by the .zfs nodes.  Opening for write is refused
- * with EACCES; every other open succeeds.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
-{
-	return ((flags & FWRITE) ? EACCES : 0);
-}
-
-/*
- * Common close routine. Nothing to do here.
- * All arguments are ignored and 0 is always returned.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
- cred_t *cr, caller_context_t *ct)
-{
- return (0);
-}
-
-/*
- * Access routine shared by the .zfs nodes.  A request that includes
- * VWRITE is refused with EACCES; anything else is granted.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	return ((mode & VWRITE) ? EACCES : 0);
-}
-
-/*
- * Fill in the attributes that are the same for every .zfs node.
- * The node id, link count and size are left to the caller.
- */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
-{
-	zfsctl_node_t	*node = vp->v_data;
-	timestruc_t	now;
-
-	/* A read-only, searchable directory owned by uid/gid 0. */
-	vap->va_type = VDIR;
-	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
-	    S_IROTH | S_IXOTH;
-	vap->va_uid = 0;
-	vap->va_gid = 0;
-	vap->va_rdev = 0;
-	vap->va_fsid = vp->v_vfsp->vfs_dev;
-	vap->va_seq = 0;
-
-	/*
-	 * We are a purely virtual object, so we have no
-	 * blocksize or allocated blocks.
-	 */
-	vap->va_blksize = 0;
-	vap->va_nblocks = 0;
-
-	/*
-	 * The access time is always the current time; the modification
-	 * and change times both come from the node.
-	 */
-	gethrestime(&now);
-	vap->va_atime = now;
-	vap->va_mtime = vap->va_ctime = node->zc_cmtime;
-}
-
-/*
- * Build a short file identifier for a .zfs node.
- *
- * If the caller's fid is shorter than SHORT_FID_LEN, the required length
- * is stored in fid_len and ENOSPC is returned.  Otherwise the node id
- * (zc_id) is stored byte by byte, least significant byte first, and
- * the generation bytes are zeroed.
- */
-/*ARGSUSED*/
-static int
-zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_node_t *zcp = vp->v_data;
- uint64_t object = zcp->zc_id;
- zfid_short_t *zfid;
- int i;
-
- ZFS_ENTER(zfsvfs);
-
- if (fidp->fid_len < SHORT_FID_LEN) {
- fidp->fid_len = SHORT_FID_LEN;
- ZFS_EXIT(zfsvfs);
- return (ENOSPC);
- }
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = SHORT_FID_LEN;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* .zfs znodes always have a generation number of 0 */
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = 0;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem. We use the following scheme:
- *
- * ENTRY ZFSCTL_INODE
- * .zfs 1
- * .zfs/snapshot 2
- * .zfs/snapshot/<snap> objectid(snap)
- *
- * ZFSCTL_INO_SNAP() is therefore the identity mapping from a snapshot's
- * object id to its inode number.
- */
-
-#define ZFSCTL_INO_SNAP(id) (id)
-
-/*
- * Get root directory attributes.
- *
- * The node id is ZFSCTL_INO_ROOT; both the link count and the size are
- * reported as NROOT_ENTRIES.  The remaining fields are filled in by
- * zfsctl_common_getattr().  Always returns 0.
- */
-/* ARGSUSED */
-static int
-zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-
- ZFS_ENTER(zfsvfs);
- vap->va_nodeid = ZFSCTL_INO_ROOT;
- vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-
- zfsctl_common_getattr(vp, vap);
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * Lookup in the '.zfs' directory.
- *
- * Special case the handling of "..": it resolves to the root vnode of
- * the file system (VFS_ROOT).  Every other name is passed to
- * gfs_vop_lookup().  Extended attribute lookups (LOOKUP_XATTR) are
- * rejected with EINVAL.
- */
-/* ARGSUSED */
-int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, vpp);
- } else {
- err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
- cr, ct, direntflags, realpnp);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
-}
-
-/*
- * Vnode operation template for the '.zfs' directory itself.  It is
- * referenced by the ".zfs" entry of zfsctl_opsvec.
- */
-static const fs_operation_def_t zfsctl_tops_root[] = {
- { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
- { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
- { VOPNAME_IOCTL, { .error = fs_inval } },
- { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } },
- { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
- { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
- { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } },
- { VOPNAME_SEEK, { .vop_seek = fs_seek } },
- { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
- { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
- { NULL }
-};
-
-/*
- * Compose the full snapshot name "<objset name>@<name>" in 'zname',
- * where the objset is the one backing the file system of 'vp'.
- * 'len' is the size of the 'zname' buffer.
- *
- * Returns ENAMETOOLONG if the composed name (with its terminator) does
- * not fit in 'len' bytes, otherwise 0.
- *
- * NOTE(review): dmu_objset_name() writes into zname before any length
- * check is made; presumably callers always pass a MAXNAMELEN buffer --
- * confirm.
- */
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
- dmu_objset_name(os, zname);
- if (strlen(zname) + 1 + strlen(name) >= len)
- return (ENAMETOOLONG);
- (void) strcat(zname, "@");
- (void) strcat(zname, name);
- return (0);
-}
-
-/*
- * Unmount the snapshot mounted on sep->se_root and, on success, free
- * the snapshot entry (its name and the entry itself).
- *
- * On failure the error from vn_vfswlock() or dounmount() is returned
- * and 'sep' is left intact.  The entry is not removed from the
- * snapdir's AVL tree here; zfsctl_snapdir_remove() does that itself
- * before calling this function.
- */
-static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
-{
- vnode_t *svp = sep->se_root;
- int error;
-
- ASSERT(vn_ismntpt(svp));
-
- /* this will be dropped by dounmount() */
- if ((error = vn_vfswlock(svp)) != 0)
- return (error);
-
- VN_HOLD(svp);
- error = dounmount(vn_mountedvfs(svp), fflags, cr);
- if (error) {
- VN_RELE(svp);
- return (error);
- }
- VFS_RELE(svp->v_vfsp);
- /*
- * We can't use VN_RELE(), as that will try to invoke
- * zfsctl_snapdir_inactive(), which would cause us to destroy
- * the sd_lock mutex held by our caller.
- */
- ASSERT(svp->v_count == 1);
- gfs_vop_inactive(svp, cr, NULL);
-
- /* se_name was allocated with strlen + 1 bytes. */
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
- return (0);
-}
-
-/*
- * Rename the mounted snapshot entry 'sep' of snapshot directory 'sdp'
- * to 'nm'.  The caller must hold sdp->sd_lock.
- *
- * The entry is re-keyed in the AVL tree under its new name, and the
- * last component of both the mount point path (after the final '/')
- * and the resource name (after the final '@') of the mounted file
- * system is replaced with 'nm'.
- *
- * The paths are copied with strlcpy() so that 'newpath' is always
- * NUL-terminated before it is searched and appended to; strncpy() does
- * not terminate the destination when the source fills the buffer.
- */
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
-{
-	avl_index_t where;
-	vfs_t *vfsp;
-	refstr_t *pathref;
-	char newpath[MAXNAMELEN];
-	char *tail;
-
-	ASSERT(MUTEX_HELD(&sdp->sd_lock));
-	ASSERT(sep != NULL);
-
-	vfsp = vn_mountedvfs(sep->se_root);
-	ASSERT(vfsp != NULL);
-
-	vfs_lock_wait(vfsp);
-
-	/*
-	 * Change the name in the AVL tree.
-	 */
-	avl_remove(&sdp->sd_snaps, sep);
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-	avl_insert(&sdp->sd_snaps, sep, where);
-
-	/*
-	 * Change the current mountpoint info:
-	 *	- update the tail of the mntpoint path
-	 *	- update the tail of the resource path
-	 */
-	pathref = vfs_getmntpoint(vfsp);
-	(void) strlcpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '/')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setmntpoint(vfsp, newpath);
-
-	pathref = vfs_getresource(vfsp);
-	(void) strlcpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '@')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setresource(vfsp, newpath);
-
-	vfs_unlock(vfsp);
-}
-
-/*
- * Rename a snapshot through '.zfs/snapshot' (snm in sdvp to tnm in tdvp).
- *
- * For case-insensitive requests the real on-disk snapshot name is looked
- * up first.  Both full snapshot names are built and the rename
- * permission is checked before anything else; moving a snapshot to a
- * different directory is refused with EINVAL and renaming to the same
- * name is a no-op.  ENOENT is returned if the source snapshot has no
- * entry in the snapdir's AVL tree.
- */
-/*ARGSUSED*/
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
- cred_t *cr, caller_context_t *ct, int flags)
-{
- zfsctl_snapdir_t *sdp = sdvp->v_data;
- zfs_snapentry_t search, *sep;
- zfsvfs_t *zfsvfs;
- avl_index_t where;
- char from[MAXNAMELEN], to[MAXNAMELEN];
- char real[MAXNAMELEN];
- int err;
-
- zfsvfs = sdvp->v_vfsp->vfs_data;
- ZFS_ENTER(zfsvfs);
-
- if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
- MAXNAMELEN, NULL);
- if (err == 0) {
- snm = real;
- } else if (err != ENOTSUP) {
- /* ENOTSUP is tolerated: carry on with the name as given. */
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
- if (!err)
- err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
- if (!err)
- err = zfs_secpolicy_rename_perms(from, to, cr);
- if (err)
- return (err);
-
- /*
- * Cannot move snapshots out of the snapdir.
- */
- if (sdvp != tdvp)
- return (EINVAL);
-
- if (strcmp(snm, tnm) == 0)
- return (0);
-
- mutex_enter(&sdp->sd_lock);
-
- search.se_name = (char *)snm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
- mutex_exit(&sdp->sd_lock);
- return (ENOENT);
- }
-
- /* Only update the cached entry once the dataset rename succeeded. */
- err = dmu_objset_rename(from, to, B_FALSE);
- if (err == 0)
- zfsctl_rename_snap(sdp, sep, tnm);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-
-/*
- * Destroy a snapshot through '.zfs/snapshot'.
- *
- * For case-insensitive requests the real on-disk snapshot name is looked
- * up first.  After the destroy permission check the snapshot's entry is
- * taken out of the snapdir's AVL tree and force-unmounted; if the
- * unmount fails the entry is put back, otherwise the snapshot dataset is
- * destroyed.  ENOENT is returned if the snapshot has no entry in the
- * AVL tree.
- */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
- caller_context_t *ct, int flags)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- zfs_snapentry_t *sep;
- zfs_snapentry_t search;
- zfsvfs_t *zfsvfs;
- char snapname[MAXNAMELEN];
- char real[MAXNAMELEN];
- int err;
-
- zfsvfs = dvp->v_vfsp->vfs_data;
- ZFS_ENTER(zfsvfs);
-
- if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-
- err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
- MAXNAMELEN, NULL);
- if (err == 0) {
- name = real;
- } else if (err != ENOTSUP) {
- /* ENOTSUP is tolerated: carry on with the name as given. */
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
- if (!err)
- err = zfs_secpolicy_destroy_perms(snapname, cr);
- if (err)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- search.se_name = name;
- sep = avl_find(&sdp->sd_snaps, &search, NULL);
- if (sep) {
- avl_remove(&sdp->sd_snaps, sep);
- /* On success zfsctl_unmount_snap() has freed sep. */
- err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
- if (err)
- avl_add(&sdp->sd_snaps, sep);
- else
- err = dmu_objset_destroy(snapname);
- } else {
- err = ENOENT;
- }
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-
-/*
- * This creates a snapshot under '.zfs/snapshot'.
- *
- * 'dirname' becomes the name of a new snapshot of the dataset that backs
- * the file system of 'dvp'.  The snapshot permission is checked first;
- * the snapshot is then taken and looked up relative to 'dvp' so that its
- * vnode can be returned in *vpp.
- *
- * *vpp is set to NULL before any check is made, so it is NULL when the
- * permission check or the snapshot itself fails.  Returns 0 on success
- * or the first error encountered.
- */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
-{
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	char name[MAXNAMELEN];
-	int err;
-
-	dmu_objset_name(zfsvfs->z_os, name);
-
-	*vpp = NULL;
-
-	err = zfs_secpolicy_snapshot_perms(name, cr);
-	if (err)
-		return (err);
-
-	err = dmu_objset_snapshot(name, dirname, B_FALSE);
-	if (err)
-		return (err);
-
-	/* dirname is a kernel string; do not follow a trailing symlink. */
-	return (lookupnameat(dirname, UIO_SYSSPACE, NO_FOLLOW, NULL, vpp,
-	    dvp));
-}
-
-/*
- * Lookup entry point for the 'snapshot' directory. Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
- */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- objset_t *snap;
- char snapname[MAXNAMELEN];
- char real[MAXNAMELEN];
- char *mountpoint;
- zfs_snapentry_t *sep, search;
- struct mounta margs;
- vfs_t *vfsp;
- size_t mountpoint_len;
- avl_index_t where;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (EINVAL);
-
- ASSERT(dvp->v_type == VDIR);
-
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
- return (0);
-
- /*
- * If we get a recursive call, that means we got called
- * from the domount() code while it was trying to look up the
- * spec (which looks like a local path for zfs). We need to
- * add some flag to domount() to tell it not to do this lookup.
- */
- if (MUTEX_HELD(&sdp->sd_lock))
- return (ENOENT);
-
- ZFS_ENTER(zfsvfs);
-
- if (flags & FIGNORECASE) {
- boolean_t conflict = B_FALSE;
-
- err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
- MAXNAMELEN, &conflict);
- if (err == 0) {
- nm = real;
- } else if (err != ENOTSUP) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- if (realpnp)
- (void) strlcpy(realpnp->pn_buf, nm,
- realpnp->pn_bufsize);
- if (conflict && direntflags)
- *direntflags = ED_CASE_CONFLICT;
- }
-
- mutex_enter(&sdp->sd_lock);
- search.se_name = (char *)nm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
- *vpp = sep->se_root;
- VN_HOLD(*vpp);
- err = traverse(vpp);
- if (err) {
- VN_RELE(*vpp);
- *vpp = NULL;
- } else if (*vpp == sep->se_root) {
- /*
- * The snapshot was unmounted behind our backs,
- * try to remount it.
- */
- goto domount;
- } else {
- /*
- * VROOT was set during the traverse call. We need
- * to clear it since we're pretending to be part
- * of our parent's vfs.
- */
- (*vpp)->v_flag &= ~VROOT;
- }
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- /*
- * The requested snapshot is not currently mounted, look it up.
- */
- err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
- if (err) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- if (dmu_objset_open(snapname, DMU_OST_ZFS,
- DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (ENOENT);
- }
-
- sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
- avl_insert(&sdp->sd_snaps, sep, where);
-
- dmu_objset_close(snap);
-domount:
- mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
- strlen("/.zfs/snapshot/") + strlen(nm) + 1;
- mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
- (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
- refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
-
- margs.spec = snapname;
- margs.dir = mountpoint;
- margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
- margs.fstype = "zfs";
- margs.dataptr = NULL;
- margs.datalen = 0;
- margs.optptr = NULL;
- margs.optlen = 0;
-
- err = domount("zfs", &margs, *vpp, kcred, &vfsp);
- kmem_free(mountpoint, mountpoint_len);
-
- if (err == 0) {
- /*
- * Return the mounted root rather than the covered mount point.
- * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
- * the ZFS vnode mounted on top of the GFS node. This ZFS
- * vnode is the root the newly created vfsp.
- */
- VFS_RELE(vfsp);
- err = traverse(vpp);
- }
-
- if (err == 0) {
- /*
- * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
- *
- * This is where we lie about our v_vfsp in order to
- * make .zfs/snapshot/<snapname> accessible over NFS
- * without requiring manual mounts of <snapname>.
- */
- ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
- VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
- (*vpp)->v_vfsp = zfsvfs->z_vfs;
- (*vpp)->v_flag &= ~VROOT;
- }
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
-
- /*
- * If we had an error, drop our hold on the vnode and
- * zfsctl_snapshot_inactive() will clean up.
- */
- if (err) {
- VN_RELE(*vpp);
- *vpp = NULL;
- }
- return (err);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
- offset_t *offp, offset_t *nextp, void *data, int flags)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- char snapname[MAXNAMELEN];
- uint64_t id, cookie;
- boolean_t case_conflict;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- cookie = *offp;
- error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
- &cookie, &case_conflict);
- if (error) {
- ZFS_EXIT(zfsvfs);
- if (error == ENOENT) {
- *eofp = 1;
- return (0);
- }
- return (error);
- }
-
- if (flags & V_RDDIR_ENTFLAGS) {
- edirent_t *eodp = dp;
-
- (void) strcpy(eodp->ed_name, snapname);
- eodp->ed_ino = ZFSCTL_INO_SNAP(id);
- eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
- } else {
- struct dirent64 *odp = dp;
-
- (void) strcpy(odp->d_name, snapname);
- odp->d_ino = ZFSCTL_INO_SNAP(id);
- }
- *nextp = cookie;
-
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
- */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
- vnode_t *vp;
- zfsctl_snapdir_t *sdp;
-
- vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
- zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
- zfsctl_snapdir_readdir_cb, NULL);
- sdp = vp->v_data;
- sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
- sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
- mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&sdp->sd_snaps, snapentry_compare,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
- return (vp);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_snapdir_t *sdp = vp->v_data;
-
- ZFS_ENTER(zfsvfs);
- zfsctl_common_getattr(vp, vap);
- vap->va_nodeid = gfs_file_inode(vp);
- vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
- zfsctl_snapdir_t *sdp = vp->v_data;
- void *private;
-
- private = gfs_dir_inactive(vp);
- if (private != NULL) {
- ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
- mutex_destroy(&sdp->sd_lock);
- avl_destroy(&sdp->sd_snaps);
- kmem_free(private, sizeof (zfsctl_snapdir_t));
- }
-}
-
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
- { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
- { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
- { VOPNAME_IOCTL, { .error = fs_inval } },
- { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } },
- { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
- { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } },
- { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } },
- { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } },
- { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
- { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } },
- { VOPNAME_SEEK, { .vop_seek = fs_seek } },
- { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } },
- { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
- { NULL }
-};
-
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot. This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
- vnode_t *vp;
- zfsctl_node_t *zcp;
-
- vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
- zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = objset;
- VFS_HOLD(vp->v_vfsp);
-
- return (vp);
-}
-
-static void
-zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- vnode_t *dvp;
-
- VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
-
- if (vp->v_count > 1) {
- mutex_exit(&sdp->sd_lock);
- return;
- }
- ASSERT(!vn_ismntpt(vp));
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- if (sep->se_root == vp) {
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
- break;
- }
- sep = next;
- }
- ASSERT(sep != NULL);
-
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
- VFS_RELE(vp->v_vfsp);
-
- /*
- * Dispose of the vnode for the snapshot mount point.
- * This is safe to do because once this entry has been removed
- * from the AVL tree, it can't be found again, so cannot become
- * "active". If we lookup the same name again we will end up
- * creating a new vnode.
- */
- gfs_vop_inactive(vp, cr, ct);
-}
-
-
-/*
- * These VP's should never see the light of day. They should always
- * be covered.
- */
-static const fs_operation_def_t zfsctl_tops_snapshot[] = {
- VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive },
- NULL, NULL
-};
-
-int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *vp;
- zfsctl_snapdir_t *sdp;
- zfsctl_node_t *zcp;
- zfs_snapentry_t *sep;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, kcred, NULL, NULL, NULL);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- vp = sep->se_root;
- zcp = vp->v_data;
- if (zcp->zc_id == objsetid)
- break;
-
- sep = AVL_NEXT(&sdp->sd_snaps, sep);
- }
-
- if (sep != NULL) {
- VN_HOLD(vp);
- /*
- * Return the mounted root rather than the covered mount point.
- * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
- * and returns the ZFS vnode mounted on top of the GFS node.
- * This ZFS vnode is the root of the vfs for objset 'objsetid'.
- */
- error = traverse(&vp);
- if (error == 0) {
- if (vp == sep->se_root)
- error = EINVAL;
- else
- *zfsvfsp = VTOZ(vp)->z_zfsvfs;
- }
- mutex_exit(&sdp->sd_lock);
- VN_RELE(vp);
- } else {
- error = EINVAL;
- mutex_exit(&sdp->sd_lock);
- }
-
- VN_RELE(dvp);
-
- return (error);
-}
-
-/*
- * Unmount any snapshots for the given filesystem. This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
-int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, cr, NULL, NULL, NULL);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- /*
- * If this snapshot is not mounted, then it must
- * have just been unmounted by somebody else, and
- * will be cleaned up by zfsctl_snapdir_inactive().
- */
- if (vn_ismntpt(sep->se_root)) {
- avl_remove(&sdp->sd_snaps, sep);
- error = zfsctl_unmount_snap(sep, fflags, cr);
- if (error) {
- avl_add(&sdp->sd_snaps, sep);
- break;
- }
- }
- sep = next;
- }
-
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
- return (error);
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_dir.c b/zfs/lib/libdmu-ctl/zfs_dir.c
deleted file mode 100644
index 6f22e2ad1..000000000
--- a/zfs/lib/libdmu-ctl/zfs_dir.c
+++ /dev/null
@@ -1,968 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_dir.c 1.25 08/04/27 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/mode.h>
-#include <sys/kmem.h>
-#include <sys/uio.h>
-#include <sys/pathname.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/unistd.h>
-#include <sys/sunddi.h>
-#include <sys/random.h>
-#include <sys/policy.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/fs/zfs.h>
-#include "fs/fs_subr.h"
-#include <sys/zap.h>
-#include <sys/dmu.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/dnlc.h>
-#include <sys/extdirent.h>
-
-/*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
- * of names after deciding which is the appropriate lookup interface.
- */
-static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
- boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
-{
- int error;
-
- if (zfsvfs->z_norm) {
- matchtype_t mt = MT_FIRST;
- boolean_t conflict = B_FALSE;
- size_t bufsz = 0;
- char *buf = NULL;
-
- if (rpnp) {
- buf = rpnp->pn_buf;
- bufsz = rpnp->pn_bufsize;
- }
- if (exact)
- mt = MT_EXACT;
- /*
- * In the non-mixed case we only expect there would ever
- * be one match, but we need to use the normalizing lookup.
- */
- error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, buf, bufsz, &conflict);
- if (!error && deflags)
- *deflags = conflict ? ED_CASE_CONFLICT : 0;
- } else {
- error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
- }
- *zoid = ZFS_DIRENT_OBJ(*zoid);
-
- if (error == ENOENT && update)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
- return (error);
-}
-
-/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
- *
- * Input arguments:
- * dzp - znode for directory
- * name - name of entry to lock
- * flag - ZNEW: if the entry already exists, fail with EEXIST.
- * ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
- * ZXATTR: we want dzp's xattr directory
- * ZCILOOK: On a mixed sensitivity file system,
- * this lookup should be case-insensitive.
- * ZCIEXACT: On a purely case-insensitive file system,
- * this lookup should be case-sensitive.
- * ZRENAMING: we are locking for renaming, force narrow locks
- *
- * Output arguments:
- * zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- * direntflags - (case-insensitive lookup only)
- * flags if multiple case-sensitive matches exist in directory
- * realpnp - (case-insensitive lookup only)
- * actual name matched within the directory
- *
- * Return value: 0 on success or errno on failure.
- *
- * NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- * but return znode pointers to a single match.
- */
-int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag, int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- boolean_t update;
- boolean_t exact;
- uint64_t zoid;
- vnode_t *vp = NULL;
- int error = 0;
- int cmpflags;
-
- *zpp = NULL;
- *dlpp = NULL;
-
- /*
- * Verify that we are not trying to lock '.', '..', or '.zfs'
- */
- if (name[0] == '.' &&
- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
- zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
- return (EEXIST);
-
- /*
- * Case sensitivity and normalization preferences are set when
- * the file system is created. These are stored in the
- * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect what vnodes can be cached in the DNLC, how we
- * perform zap lookups, and the "width" of our dirlocks.
- *
- * A normal dirlock locks a single name. Note that with
- * normalization a name can be composed multiple ways, but
- * when normalized, these names all compare equal. A wide
- * dirlock locks multiple names. We need these when the file
- * system is supporting mixed-mode access. It is sometimes
- * necessary to lock all case permutations of file name at
- * once so that simultaneous case-insensitive/case-sensitive
- * behaves as rationally as possible.
- */
-
- /*
- * Decide if exact matches should be requested when performing
- * a zap lookup on file systems supporting case-insensitive
- * access.
- */
- exact =
- ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
- *
- * Maybe can add TO-UPPERed version of name to dnlc in ci-only
- * case for performance improvement?
- */
- update = !zfsvfs->z_norm ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
-
- /*
- * ZRENAMING indicates we are in a situation where we should
- * take narrow locks regardless of the file system's
- * preferences for normalizing and case folding. This will
- * prevent us deadlocking trying to grab the same wide lock
- * twice if the two names happen to be case-insensitive
- * matches.
- */
- if (flag & ZRENAMING)
- cmpflags = 0;
- else
- cmpflags = zfsvfs->z_norm;
-
- /*
- * Wait until there are no locks on this name.
- */
- rw_enter(&dzp->z_name_lock, RW_READER);
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked) {
- mutex_exit(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- return (ENOENT);
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
- if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
- U8_UNICODE_LATEST, &error) == 0) || error != 0)
- break;
- }
- if (error != 0) {
- mutex_exit(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- return (ENOENT);
- }
- if (dl == NULL) {
- /*
- * Allocate a new dirlock and add it to the list.
- */
- dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = name;
- dl->dl_sharecnt = 0;
- dl->dl_namesize = 0;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
- /*
- * We're the second shared reference to dl. Make a copy of
- * dl_name in case the first thread goes away before we do.
- * Note that we initialize the new name before storing its
- * pointer into dl_name, because the first thread may load
- * dl->dl_name at any time. He'll either see the old value,
- * which is his, or the new shared copy; either is OK.
- */
- dl->dl_namesize = strlen(dl->dl_name) + 1;
- name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
- bcopy(dl->dl_name, name, dl->dl_namesize);
- dl->dl_name = name;
- }
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
- if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
- } else {
- if (update)
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = ENOENT;
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (EEXIST);
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, exact,
- update, direntflags, realpnp, &zoid);
- }
- }
- if (error) {
- if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- } else {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- return (EEXIST);
- }
- error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- if (!(flag & ZXATTR) && update)
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
- }
-
- *dlpp = dl;
-
- return (0);
-}
-
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
-{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
-
- mutex_enter(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- if (dl->dl_namesize != 0)
- kmem_free(dl->dl_name, dl->dl_namesize);
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl));
-}
-
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
-int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
- int *deflg, pathname_t *rpnp)
-{
- zfs_dirlock_t *dl;
- znode_t *zp;
- int error = 0;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred,
- NULL, NULL, NULL);
- return (error);
- }
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
- } else {
- int zf;
-
- zf = ZEXISTS | ZSHARED;
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
- if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
- dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
- }
- rpnp = NULL;
- }
-
- if ((flags & FIGNORECASE) && rpnp && !error)
- (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
- return (error);
-}
-
-static char *
-zfs_unlinked_hexname(char namebuf[17], uint64_t x)
-{
- char *name = &namebuf[16];
- const char digits[16] = "0123456789abcdef";
-
- *name = '\0';
- do {
- *--name = digits[x & 0xf];
- x >>= 4;
- } while (x != 0);
-
- return (name);
-}
-
-/*
- * unlinked Set (formerly known as the "delete queue") Error Handling
- *
- * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
- * don't specify the name of the entry that we will be manipulating. We
- * also fib and say that we won't be adding any new entries to the
- * unlinked set, even though we might (this is to lower the minimum file
- * size that can be deleted in a full filesystem). So on the small
- * chance that the nlink list is using a fat zap (ie. has more than
- * 2000 entries), we *may* not pre-read a block that's needed.
- * Therefore it is remotely possible for some of the assertions
- * regarding the unlinked set below to fail due to i/o error. On a
- * nondebug system, this will result in the space being leaked.
- */
-void
-zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- char obj_name[17];
- int error;
-
- ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
-
- error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
- ASSERT3U(error, ==, 0);
-}
-
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- dmu_object_info_t doi;
- znode_t *zp;
- int error;
-
- /*
- * Interate over the contents of the unlinked set.
- */
- for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0;
- zap_cursor_advance(&zc)) {
-
- /*
- * See what kind of object we have in list
- */
-
- error = dmu_object_info(zfsvfs->z_os,
- zap.za_first_integer, &doi);
- if (error != 0)
- continue;
-
- ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
- (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
- /*
- * We need to re-mark these list entries for deletion,
- * so we pull them back into core and set zp->z_unlinked.
- */
- error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
-
- /*
- * We may pick up znodes that are already marked for deletion.
- * This could happen during the purge of an extended attribute
- * directory. All we need to do is skip over them, since they
- * are already in the system marked z_unlinked.
- */
- if (error != 0)
- continue;
-
- zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
- }
- zap_cursor_fini(&zc);
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted. If we encounter
- * an error, return a count of at least one so that the directory stays
- * in the unlinked set.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory contents is *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- znode_t *xzp;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
- int skipped = 0;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- error = zfs_zget(zfsvfs,
- ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
- if (error) {
- skipped += 1;
- continue;
- }
-
- ASSERT((ZTOV(xzp)->v_type == VREG) ||
- (ZTOV(xzp)->v_type == VLNK));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
- skipped += 1;
- continue;
- }
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
-
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
- if (error)
- skipped += 1;
- dmu_tx_commit(tx);
-
- VN_RELE(ZTOV(xzp));
- }
- zap_cursor_fini(&zc);
- if (error != ENOENT)
- skipped += 1;
- return (skipped);
-}
-
-void
-zfs_rmnode(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- znode_t *xzp = NULL;
- char obj_name[17];
- dmu_tx_t *tx;
- uint64_t acl_obj;
- int error;
-
- ASSERT(ZTOV(zp)->v_count == 0);
- ASSERT(zp->z_phys->zp_links == 0);
-
- /*
- * If this is an attribute directory, purge its contents.
- */
- if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
- if (zfs_purgedir(zp) != 0) {
- /*
- * Not enough space to delete some xattrs.
- * Leave it on the unlinked set.
- */
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- return;
- }
- }
-
- /*
- * If the file has extended attributes, we're going to unlink
- * the xattr dir.
- */
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT(error == 0);
- }
-
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
-
- /*
- * Set up the transaction.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
- }
- if (acl_obj)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /*
- * Not enough space to delete the file. Leave it in the
- * unlinked set, leaking it until the fs is remounted (at
- * which point we'll call zfs_unlinked_drain() to process it).
- */
- dmu_tx_abort(tx);
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- goto out;
- }
-
- if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- }
-
- /* Remove this znode from the unlinked set */
- error = zap_remove(os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), tx);
- ASSERT3U(error, ==, 0);
-
- zfs_znode_delete(zp, tx);
-
- dmu_tx_commit(tx);
-out:
- if (xzp)
- VN_RELE(ZTOV(xzp));
-}
-
-static uint64_t
-zfs_dirent(znode_t *zp)
-{
- uint64_t de = zp->z_id;
- if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT((zp)->z_phys->zp_mode) << 60;
- return (de);
-}
-
-/*
- * Link zp into dl. Can only fail if zp has been unlinked.
- */
-int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- uint64_t value;
- int zp_is_dir = (vp->v_type == VDIR);
- int error;
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
-
- if (!(flag & ZRENAMING)) {
- if (zp->z_unlinked) { /* no new links to unlinked zp */
- ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
- return (ENOENT);
- }
- zp->z_phys->zp_links++;
- }
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
-
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- mutex_exit(&zp->z_lock);
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- value = zfs_dirent(zp);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
- 8, 1, &value, tx);
- ASSERT(error == 0);
-
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
-
- return (0);
-}
-
-/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
- * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
- * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
- * If it's non-NULL, we use it to indicate whether the znode needs deletion,
- * and it's the caller's job to do it.
- *
- * Unless ZRENAMING is set, zp's link count is dropped under z_lock while
- * the vnode's vfs write lock is held. The early error returns happen
- * before the parent znode or its ZAP entry is modified (the DNLC entry
- * has already been removed by then). A ZAP removal failure is only
- * caught by ASSERT; the function returns 0 in every other case.
- */
-int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- int zp_is_dir = (vp->v_type == VDIR);
- boolean_t unlinked = B_FALSE;
- int error;
-
- dnlc_remove(ZTOV(dzp), dl->dl_name);
-
- if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (EBUSY);
-
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (EBUSY);
- }
-
- mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- return (EEXIST);
- }
- if (zp->z_phys->zp_links <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on %s is %u, "
- "should be at least %u",
- zp->z_vnode->v_path ? zp->z_vnode->v_path :
- "<unknown>", (int)zp->z_phys->zp_links,
- zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
- }
- if (--zp->z_phys->zp_links == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
- unlinked = B_TRUE;
- } else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- }
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
- else
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
- } else {
- error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
- }
- ASSERT(error == 0);
-
- if (unlinkedp != NULL)
- *unlinkedp = unlinked;
- else if (unlinked)
- zfs_unlinked_add(zp, tx);
-
- return (0);
-}
-
-/*
- * Report whether a directory is empty: only "." and ".." remain
- * (zp_size == 2) and no directory locks are outstanding. May be called
- * with or without z_lock held; without it the answer can only be
- * considered a hint.
- */
-boolean_t
-zfs_dirempty(znode_t *dzp)
-{
-	if (dzp->z_dirlocks != 0)
-		return (B_FALSE);
-
-	return (dzp->z_phys->zp_size == 2);
-}
-
-/*
- * Create the extended attribute directory for zp inside its own
- * transaction and record it in zp's zp_xattr field.
- *
- * On success *xvpp is the new directory's vnode and 0 is returned.
- * On failure *xvpp is NULL and the error from zfs_zaccess() or
- * dmu_tx_assign() is returned; for ERESTART under TXG_NOWAIT,
- * dmu_tx_wait() has already been called before the tx was aborted.
- */
-int
-zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
-{
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	zfs_fuid_info_t *fuidp = NULL;
-	znode_t *xzp;
-	dmu_tx_t *tx;
-	int error;
-
-	*xvpp = NULL;
-
-	error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr);
-	if (error != 0)
-		return (error);
-
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-
-	/* An ephemeral owner or group needs holds on the FUID table too. */
-	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj != 0) {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		} else {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		}
-	}
-
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (error != 0) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
-			dmu_tx_wait(tx);
-		dmu_tx_abort(tx);
-		return (error);
-	}
-
-	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
-	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
-	zp->z_phys->zp_xattr = xzp->z_id;
-
-	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
-	    xzp, "", NULL, fuidp, vap);
-	if (fuidp != NULL)
-		zfs_fuid_info_free(fuidp);
-	dmu_tx_commit(tx);
-
-	*xvpp = ZTOV(xzp);
-	return (0);
-}
-
-/*
- * Return a vnode for the extended attribute directory for zp.
- * ** If the directory does not already exist, it is created **
- * (only when CREATE_XATTR_DIR is set in flags)
- *
- * IN: zp - znode to obtain attribute directory from
- * cr - credentials of caller
- * flags - flags from the VOP_LOOKUP call
- *
- * OUT: xvpp - vnode of the extended attribute directory
- *
- * RETURN: 0 on success
- * ENOENT if the directory is missing and CREATE_XATTR_DIR is not set
- * EROFS if it is missing and the file system is read-only
- * other error number on failure
- */
-int
-zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- zfs_dirlock_t *dl;
- vattr_t va;
- int error;
-top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
- if (error)
- return (error);
-
- if (xzp != NULL) {
- *xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
- return (0);
- }
-
- ASSERT(zp->z_phys->zp_xattr == 0);
-
- if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
- return (ENOENT);
- }
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
- return (EROFS);
- }
-
- /*
- * The ability to 'create' files in an attribute
- * directory comes from the write_xattr permission on the base file.
- *
- * The ability to 'search' an attribute directory requires
- * read_xattr permission on the base file.
- *
- * Once in a directory the ability to read/write attributes
- * is controlled by the permissions on the attribute file.
- */
- va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
- va.va_type = VDIR;
- va.va_mode = S_IFDIR | S_ISVTX | 0777;
- zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
-
- error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
-
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() if necessary */
- goto top;
- }
-
- return (error);
-}
-
-/*
- * Decide whether the caller may remove zp from the sticky directory zdp.
- *
- * Write access alone is not enough in a sticky directory; removal is
- * allowed only if one of the following holds:
- *
- *	the caller owns the directory,
- *	the caller owns the entry,
- *	the entry is a plain file and the caller has write access to it,
- *	or the caller is privileged (checked by secpolicy_vnode_remove()).
- *
- * The check is skipped during ZIL replay and for non-sticky directories.
- * Returns 0 if remove access is granted.
- */
-int
-zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
-{
-	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-	uid_t dir_owner;
-	uid_t file_owner;
-	uid_t caller;
-
-	if (zfsvfs->z_assign >= TXG_INITIAL)	/* ZIL replay */
-		return (0);
-
-	if (!(zdp->z_phys->zp_mode & S_ISVTX))
-		return (0);
-
-	dir_owner = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr,
-	    ZFS_OWNER);
-	file_owner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr,
-	    ZFS_OWNER);
-	caller = crgetuid(cr);
-
-	if (caller == dir_owner || caller == file_owner)
-		return (0);
-
-	if (ZTOV(zp)->v_type == VREG &&
-	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
-		return (0);
-
-	return (secpolicy_vnode_remove(cr));
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_fuid.c b/zfs/lib/libdmu-ctl/zfs_fuid.c
deleted file mode 100644
index 59c9adfe2..000000000
--- a/zfs/lib/libdmu-ctl/zfs_fuid.c
+++ /dev/null
@@ -1,688 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_fuid.c 1.5 08/01/31 SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/sunddi.h>
-#include <sys/dmu.h>
-#include <sys/avl.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/nvpair.h>
-#ifdef _KERNEL
-#include <sys/kidmap.h>
-#include <sys/sid.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-#endif
-#include <sys/zfs_fuid.h>
-
-/*
- * FUID Domain table(s).
- *
- * The FUID table is stored as a packed nvlist of an array
- * of nvlists which contain an index, domain string and offset
- *
- * During file system initialization the nvlist(s) are read and
- * two AVL trees are created. One tree is keyed by the index number
- * and the other by the domain string. Nodes are never removed from
- * trees, but new entries may be added. If a new entry is added then the
- * on-disk packed nvlist will also be updated.
- */
-
-#define FUID_IDX "fuid_idx"
-#define FUID_DOMAIN "fuid_domain"
-#define FUID_OFFSET "fuid_offset"
-#define FUID_NVP_ARRAY "fuid_nvlist"
-
-typedef struct fuid_domain {
- avl_node_t f_domnode; /* linkage in the domain-keyed AVL tree */
- avl_node_t f_idxnode; /* linkage in the index-keyed AVL tree */
- ksiddomain_t *f_ksid; /* domain handle; kd_name is the domain string */
- uint64_t f_idx; /* index of this domain in the FUID table */
-} fuid_domain_t;
-
-/*
- * AVL comparator ordering fuid_domain_t nodes by f_idx.
- * Returns -1, 0 or 1.
- */
-static int
-idx_compare(const void *arg1, const void *arg2)
-{
-	const fuid_domain_t *lhs = arg1;
-	const fuid_domain_t *rhs = arg2;
-
-	if (lhs->f_idx == rhs->f_idx)
-		return (0);
-
-	return (lhs->f_idx < rhs->f_idx ? -1 : 1);
-}
-
-/*
- * AVL comparator ordering fuid_domain_t nodes by domain string.
- * The strcmp() result is normalized to -1, 0 or 1.
- */
-static int
-domain_compare(const void *arg1, const void *arg2)
-{
-	const fuid_domain_t *lhs = arg1;
-	const fuid_domain_t *rhs = arg2;
-	int cmp = strcmp(lhs->f_ksid->kd_name, rhs->f_ksid->kd_name);
-
-	if (cmp > 0)
-		return (1);
-	if (cmp < 0)
-		return (-1);
-	return (0);
-}
-
-/*
- * Load the initial fuid domain and idx trees. Used by both the kernel
- * and zdb.
- *
- * Both trees are (re)created empty, the packed size is read from the
- * bonus buffer of fuid_obj, and if it is non-zero the packed nvlist is
- * read, unpacked and one node per array element is inserted into both
- * trees. Every failure along the way trips a VERIFY.
- *
- * Returns the packed size of the on-disk table (0 when it is empty).
- */
-uint64_t
-zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
-    avl_tree_t *domain_tree)
-{
-	dmu_buf_t *bonus;
-	uint64_t fuid_size;
-	nvlist_t **entries;
-	nvlist_t *table = NULL;
-	uint_t nentries;
-	char *packed;
-	int n;
-
-	avl_create(idx_tree, idx_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
-	avl_create(domain_tree, domain_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
-
-	VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &bonus));
-	fuid_size = *(uint64_t *)bonus->db_data;
-	dmu_buf_rele(bonus, FTAG);
-
-	if (fuid_size == 0)
-		return (fuid_size);
-
-	packed = kmem_alloc(fuid_size, KM_SLEEP);
-	VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
-	VERIFY(nvlist_unpack(packed, fuid_size,
-	    &table, 0) == 0);
-	VERIFY(nvlist_lookup_nvlist_array(table, FUID_NVP_ARRAY,
-	    &entries, &nentries) == 0);
-
-	for (n = 0; n != nentries; n++) {
-		fuid_domain_t *node;
-		char *domain;
-		uint64_t idx;
-
-		VERIFY(nvlist_lookup_string(entries[n], FUID_DOMAIN,
-		    &domain) == 0);
-		VERIFY(nvlist_lookup_uint64(entries[n], FUID_IDX,
-		    &idx) == 0);
-
-		node = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
-		node->f_idx = idx;
-		node->f_ksid = ksid_lookupdomain(domain);
-
-		avl_add(idx_tree, node);
-		avl_add(domain_tree, node);
-	}
-
-	nvlist_free(table);
-	kmem_free(packed, fuid_size);
-
-	return (fuid_size);
-}
-
-/*
- * Tear down both FUID trees. Every node is linked into both trees, so
- * the domain tree pass only drops the ksid domain reference and the
- * idx tree pass frees the node itself.
- */
-void
-zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
-{
-	fuid_domain_t *node;
-	void *cookie = NULL;
-
-	while ((node = avl_destroy_nodes(domain_tree, &cookie)) != NULL)
-		ksiddomain_rele(node->f_ksid);
-	avl_destroy(domain_tree);
-
-	cookie = NULL;
-	while ((node = avl_destroy_nodes(idx_tree, &cookie)) != NULL)
-		kmem_free(node, sizeof (fuid_domain_t));
-	avl_destroy(idx_tree);
-}
-
-/*
- * Look up the domain string for a FUID table index.
- *
- * idx_tree	index-keyed tree built by zfs_fuid_table_load()
- * idx		domain index to look for
- *
- * Returns the domain string held by the matching node, or NULL when no
- * node with that index exists (previously this dereferenced a NULL
- * pointer).
- */
-char *
-zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
-{
-	fuid_domain_t searchnode, *findnode;
-
-	/* Only f_idx is read by the index comparator. */
-	searchnode.f_idx = idx;
-
-	findnode = avl_find(idx_tree, &searchnode, NULL);
-	if (findnode == NULL)
-		return (NULL);
-
-	return (findnode->f_ksid->kd_name);
-}
-
-#ifdef _KERNEL
-/*
- * Load the fuid table(s) into memory.
- *
- * Takes z_fuid_lock as writer and returns immediately if the table is
- * already loaded. If no FUID object is known yet it is looked up in the
- * master node; when that lookup returns ENOENT and a transaction was
- * supplied, a new object is allocated and registered there.
- *
- * NOTE(review): if the lookup fails and tx is NULL (or the error is not
- * ENOENT), z_fuid_obj is still 0 when zfs_fuid_table_load() is called,
- * whose VERIFY on dmu_bonus_hold() would then presumably fire -- confirm
- * that callers cannot reach this.
- */
-static void
-zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
-{
- int error = 0;
-
- rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
-
- if (zfsvfs->z_fuid_loaded) {
- rw_exit(&zfsvfs->z_fuid_lock);
- return;
- }
-
- if (zfsvfs->z_fuid_obj == 0) {
-
- /* first make sure we need to allocate object */
-
- error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
- if (error == ENOENT && tx != NULL) {
- zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
- sizeof (uint64_t), tx);
- VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, sizeof (uint64_t), 1,
- &zfsvfs->z_fuid_obj, tx) == 0);
- }
- }
-
- zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
- zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
-
- zfsvfs->z_fuid_loaded = B_TRUE;
- rw_exit(&zfsvfs->z_fuid_lock);
-}
-
-/*
- * Query domain table for a given domain.
- *
- * If domain isn't found it is added to AVL trees and
- * the results are pushed out to disk.
- *
- * zfsvfs	file system whose FUID table is consulted
- * domain	domain string; "" is the dummy "nobody" domain
- * retdomain	if non-NULL, set to the domain string to use for logging
- * tx		transaction used when the table has to be created or
- *		rewritten
- *
- * Returns the index of the domain in the table (0 for the dummy domain).
- */
-int
-zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
-    dmu_tx_t *tx)
-{
-	fuid_domain_t searchnode, *findnode;
-	fuid_domain_t *domnode;
-	avl_index_t loc;
-	nvlist_t *nvp;
-	nvlist_t **fuids;
-	uint64_t retidx;
-	size_t nvsize = 0;
-	char *packed;
-	dmu_buf_t *db;
-	int i = 0;
-
-	/*
-	 * If the dummy "nobody" domain then return an index of 0
-	 * to cause the created FUID to be a standard POSIX id
-	 * for the user nobody.
-	 */
-	if (domain[0] == '\0') {
-		/* retdomain is optional here just as it is below. */
-		if (retdomain)
-			*retdomain = "";
-		return (0);
-	}
-
-	searchnode.f_ksid = ksid_lookupdomain(domain);
-	if (retdomain)
-		*retdomain = searchnode.f_ksid->kd_name;
-	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, tx);
-
-	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
-	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
-	rw_exit(&zfsvfs->z_fuid_lock);
-
-	if (findnode) {
-		ksiddomain_rele(searchnode.f_ksid);
-		return (findnode->f_idx);
-	}
-
-	domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
-	domnode->f_ksid = searchnode.f_ksid;
-
-	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
-
-	/*
-	 * The reader lock was dropped above, so another thread may have
-	 * added this domain in the meantime. Look again under the writer
-	 * lock; inserting a duplicate would corrupt the trees.
-	 */
-	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
-	if (findnode) {
-		retidx = findnode->f_idx;
-		rw_exit(&zfsvfs->z_fuid_lock);
-		kmem_free(domnode, sizeof (fuid_domain_t));
-		ksiddomain_rele(searchnode.f_ksid);
-		return (retidx);
-	}
-
-	retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
-
-	avl_add(&zfsvfs->z_fuid_domain, domnode);
-	avl_add(&zfsvfs->z_fuid_idx, domnode);
-
-	/*
-	 * Now resync the on-disk nvlist.
-	 */
-	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	domnode = avl_first(&zfsvfs->z_fuid_domain);
-	fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
-	while (domnode) {
-		VERIFY(nvlist_alloc(&fuids[i],
-		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
-		    domnode->f_idx) == 0);
-		VERIFY(nvlist_add_uint64(fuids[i],
-		    FUID_OFFSET, 0) == 0);
-		VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
-		    domnode->f_ksid->kd_name) == 0);
-		domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
-	}
-	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
-	    fuids, retidx) == 0);
-	for (i = 0; i != retidx; i++)
-		nvlist_free(fuids[i]);
-	kmem_free(fuids, retidx * sizeof (void *));
-
-	VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
-	packed = kmem_alloc(nvsize, KM_SLEEP);
-	VERIFY(nvlist_pack(nvp, &packed, &nvsize,
-	    NV_ENCODE_XDR, KM_SLEEP) == 0);
-	nvlist_free(nvp);
-
-	zfsvfs->z_fuid_size = nvsize;
-	dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
-	    zfsvfs->z_fuid_size, packed, tx);
-	kmem_free(packed, zfsvfs->z_fuid_size);
-
-	/* The packed size lives in the object's bonus buffer. */
-	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
-	    FTAG, &db));
-	dmu_buf_will_dirty(db, tx);
-	*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
-	dmu_buf_rele(db, FTAG);
-
-	rw_exit(&zfsvfs->z_fuid_lock);
-	return (retidx);
-}
-
-/*
- * Query the domain table by index.
- *
- * Returns the domain string stored in the matching AVL node, or NULL
- * when idx is 0 or the file system does not use FUIDs. The table is
- * loaded on first use.
- */
-static char *
-zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
-{
-	char *name;
-
-	if (!zfsvfs->z_use_fuids || idx == 0)
-		return (NULL);
-
-	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, NULL);
-
-	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
-	name = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
-	rw_exit(&zfsvfs->z_fuid_lock);
-
-	ASSERT(name);
-	return (name);
-}
-
-/*
- * Map the owner and group FUIDs stored in zp to ids via
- * zfs_fuid_map_id(), storing the results in *uidp and *gidp.
- */
-void
-zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
-{
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
-	*uidp = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
-	*gidp = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_gid, cr, ZFS_GROUP);
-}
-
-/*
- * Map a FUID to a uid or gid.
- *
- * zfsvfs	file system the FUID belongs to
- * fuid		FUID to map
- * cr		credential whose zone is used for the idmap query
- * type		ZFS_OWNER/ZFS_ACE_USER select a uid lookup, anything else
- *		a gid lookup
- *
- * A FUID with domain index 0 is returned as-is. Otherwise the domain is
- * looked up and the kidmap service is asked for the id. The kidmap
- * return value is ignored, so the result starts out as UID_NOBODY
- * rather than being left uninitialized if the lookup does not fill it
- * in.
- */
-uid_t
-zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
-    cred_t *cr, zfs_fuid_type_t type)
-{
-	uint32_t index = FUID_INDEX(fuid);
-	char *domain;
-	uid_t id = UID_NOBODY;
-
-	if (index == 0)
-		return (fuid);
-
-	domain = zfs_fuid_find_by_idx(zfsvfs, index);
-	ASSERT(domain != NULL);
-
-	if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
-		(void) kidmap_getuidbysid(crgetzone(cr), domain,
-		    FUID_RID(fuid), &id);
-	} else {
-		(void) kidmap_getgidbysid(crgetzone(cr), domain,
-		    FUID_RID(fuid), &id);
-	}
-	return (id);
-}
-
-/*
- * Record a FUID in the zfs_fuid_info_t being built for this operation,
- * allocating the info structure on first use.
- *
- * Each distinct domain index is kept once in z_domains; the 1-based
- * position of the domain in that list is what gets encoded into the
- * logged FUID. ACE user/group entries are appended to z_fuids, while
- * owner and group FUIDs are stored directly in the info structure.
- */
-static void
-zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
-    uint64_t idx, uint64_t id, zfs_fuid_type_t type)
-{
-	zfs_fuid_info_t *info;
-	zfs_fuid_domain_t *dom;
-	zfs_fuid_t *entry;
-	uint64_t pos = 1;
-
-	if (*fuidpp == NULL)
-		*fuidpp = zfs_fuid_info_alloc();
-	info = *fuidpp;
-
-	/* Find the domain's position in the list, counting from 1. */
-	dom = list_head(&info->z_domains);
-	while (dom != NULL && dom->z_domidx != idx) {
-		dom = list_next(&info->z_domains, dom);
-		pos++;
-	}
-
-	/* Not seen yet: append it, so pos is now its position. */
-	if (dom == NULL) {
-		dom = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
-		dom->z_domain = domain;
-		dom->z_domidx = idx;
-		list_insert_tail(&info->z_domains, dom);
-		info->z_domain_str_sz += strlen(domain) + 1;
-		info->z_domain_cnt++;
-	}
-
-	switch (type) {
-	case ZFS_ACE_USER:
-	case ZFS_ACE_GROUP:
-		entry = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
-		entry->z_id = id;
-		entry->z_domidx = idx;
-		entry->z_logfuid = FUID_ENCODE(pos, rid);
-
-		list_insert_tail(&info->z_fuids, entry);
-		info->z_fuid_cnt++;
-		break;
-	case ZFS_OWNER:
-		info->z_fuid_owner = FUID_ENCODE(pos, rid);
-		break;
-	default:
-		info->z_fuid_group = FUID_ENCODE(pos, rid);
-		break;
-	}
-}
-
-/*
- * Create a file system FUID from the information in the caller's cred.
- *
- * type must be ZFS_OWNER or ZFS_GROUP. When FUIDs are not in use or the
- * id is not ephemeral the plain id is returned. Otherwise the sid in
- * the cred supplies the domain and rid, the domain is looked up (or
- * added) in the FUID table, and the FUID is recorded in *fuidp.
- */
-uint64_t
-zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
-    dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
-{
-	const char *domain;
-	char *kdomain;
-	ksid_t *ksid;
-	uint64_t idx;
-	uint32_t rid;
-	uid_t id;
-
-	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
-
-	id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
-
-	if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id))
-		return ((uint64_t)id);
-
-	ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
-	VERIFY(ksid != NULL);
-
-	rid = ksid_getrid(ksid);
-	domain = ksid_getdomain(ksid);
-
-	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
-	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
-
-	return (FUID_ENCODE(idx, rid));
-}
-
-/*
- * Create a file system FUID for an ACL ace
- * or a chown/chgrp of the file.
- * This is similar to zfs_fuid_create_cred, except that
- * we can't find the domain + rid information in the
- * cred. Instead we have to query Winchester for the
- * domain and rid.
- *
- * During replay operations the domain+rid information is
- * found in the zfs_fuid_info_t that the replay code has
- * attached to the zfsvfs of the file system.
- *
- * Returns id unchanged when FUIDs are not in use, id is not ephemeral,
- * or id already carries a domain index; returns UID_NOBODY during
- * replay when no fuid info was logged; otherwise returns the encoded
- * (domain index, rid) pair.
- *
- * NOTE(review): in the replay path the list_head() of z_fuids is
- * dereferenced without a NULL check, the switch has no default (rid and
- * idx stay unset for any other type), and idx is used as
- * z_domain_table[idx - 1] without a range check -- presumably the log
- * records guarantee all three; confirm.
- */
-uint64_t
-zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
- zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
-{
- const char *domain;
- char *kdomain;
- uint32_t fuid_idx = FUID_INDEX(id);
- uint32_t rid;
- idmap_stat status;
- uint64_t idx;
- boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
- zfs_fuid_t *zfuid = NULL;
- zfs_fuid_info_t *fuidp;
-
- /*
- * If POSIX ID, or entry is already a FUID then
- * just return the id
- *
- * We may also be handed an already FUID'ized id via
- * chmod.
- */
-
- if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
- return (id);
-
- if (is_replay) {
- fuidp = zfsvfs->z_fuid_replay;
-
- /*
- * If we are passed an ephemeral id, but no
- * fuid_info was logged then return NOBODY.
- * This is most likely a result of idmap service
- * not being available.
- */
- if (fuidp == NULL)
- return (UID_NOBODY);
-
- switch (type) {
- case ZFS_ACE_USER:
- case ZFS_ACE_GROUP:
- zfuid = list_head(&fuidp->z_fuids);
- rid = FUID_RID(zfuid->z_logfuid);
- idx = FUID_INDEX(zfuid->z_logfuid);
- break;
- case ZFS_OWNER:
- rid = FUID_RID(fuidp->z_fuid_owner);
- idx = FUID_INDEX(fuidp->z_fuid_owner);
- break;
- case ZFS_GROUP:
- rid = FUID_RID(fuidp->z_fuid_group);
- idx = FUID_INDEX(fuidp->z_fuid_group);
- break;
- };
- domain = fuidp->z_domain_table[idx -1];
- } else {
- if (type == ZFS_OWNER || type == ZFS_ACE_USER)
- status = kidmap_getsidbyuid(crgetzone(cr), id,
- &domain, &rid);
- else
- status = kidmap_getsidbygid(crgetzone(cr), id,
- &domain, &rid);
-
- if (status != 0) {
- /*
- * When returning nobody we will need to
- * make a dummy fuid table entry for logging
- * purposes.
- */
- rid = UID_NOBODY;
- domain = "";
- }
- }
-
- idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
-
- if (!is_replay)
- zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
- else if (zfuid != NULL) {
- list_remove(&fuidp->z_fuids, zfuid);
- kmem_free(zfuid, sizeof (zfs_fuid_t));
- }
- return (FUID_ENCODE(idx, rid));
-}
-
-/*
- * Destroy the in-memory FUID trees of a file system if they were ever
- * loaded. Runs entirely under z_fuid_lock held as writer.
- */
-void
-zfs_fuid_destroy(zfsvfs_t *zfsvfs)
-{
-	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
-	if (zfsvfs->z_fuid_loaded) {
-		zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx,
-		    &zfsvfs->z_fuid_domain);
-	}
-	rw_exit(&zfsvfs->z_fuid_lock);
-}
-
-/*
- * Allocate a zeroed zfs_fuid_info_t with empty domain and fuid lists.
- * It tracks the FUIDs created during zfs_mknode, VOP_SETATTR() or
- * VOP_SETSECATTR(); release it with zfs_fuid_info_free().
- */
-zfs_fuid_info_t *
-zfs_fuid_info_alloc(void)
-{
-	zfs_fuid_info_t *info = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
-
-	list_create(&info->z_domains, sizeof (zfs_fuid_domain_t),
-	    offsetof(zfs_fuid_domain_t, z_next));
-	list_create(&info->z_fuids, sizeof (zfs_fuid_t),
-	    offsetof(zfs_fuid_t, z_next));
-
-	return (info);
-}
-
-/*
- * Release a zfs_fuid_info_t and everything hanging off it: the fuid
- * entries, the domain table (if one was allocated) and the domain
- * entries.
- */
-void
-zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
-{
-	zfs_fuid_t *entry;
-	zfs_fuid_domain_t *dom;
-
-	for (entry = list_head(&fuidp->z_fuids); entry != NULL;
-	    entry = list_head(&fuidp->z_fuids)) {
-		list_remove(&fuidp->z_fuids, entry);
-		kmem_free(entry, sizeof (zfs_fuid_t));
-	}
-
-	if (fuidp->z_domain_table != NULL) {
-		kmem_free(fuidp->z_domain_table,
-		    (sizeof (char **)) * fuidp->z_domain_cnt);
-	}
-
-	for (dom = list_head(&fuidp->z_domains); dom != NULL;
-	    dom = list_head(&fuidp->z_domains)) {
-		list_remove(&fuidp->z_domains, dom);
-		kmem_free(dom, sizeof (zfs_fuid_domain_t));
-	}
-
-	kmem_free(fuidp, sizeof (zfs_fuid_info_t));
-}
-
-/*
- * Check to see if id is a groupmember. If cred
- * has ksid info then sidlist is checked first
- * and if still not found then POSIX groups are checked
- *
- * Will use a straight FUID compare when possible.
- *
- * NOTE(review): for a FUID with a non-zero domain index, the domain
- * lookup and the creator-authority check are repeated on every loop
- * iteration although they do not depend on i. They could be hoisted,
- * but doing so would change the result when the sid list is empty.
- */
-boolean_t
-zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
-{
- ksid_t *ksid = crgetsid(cr, KSID_GROUP);
- uid_t gid;
-
- if (ksid) {
- int i;
- ksid_t *ksid_groups;
- ksidlist_t *ksidlist = crgetsidlist(cr);
- uint32_t idx = FUID_INDEX(id);
- uint32_t rid = FUID_RID(id);
-
- ASSERT(ksidlist);
- ksid_groups = ksidlist->ksl_sids;
-
- for (i = 0; i != ksidlist->ksl_nsid; i++) {
- if (idx == 0) {
- if (id != IDMAP_WK_CREATOR_GROUP_GID &&
- id == ksid_groups[i].ks_id) {
- return (B_TRUE);
- }
- } else {
- char *domain;
-
- domain = zfs_fuid_find_by_idx(zfsvfs, idx);
- ASSERT(domain != NULL);
-
- if (strcmp(domain,
- IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
- return (B_FALSE);
-
- if ((strcmp(domain,
- ksid_groups[i].ks_domain->kd_name) == 0) &&
- rid == ksid_groups[i].ks_rid)
- return (B_TRUE);
- }
- }
- }
-
- /*
- * Not found in ksidlist, check posix groups
- */
- gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
- return (groupmember(gid, cr));
-}
-#endif
diff --git a/zfs/lib/libdmu-ctl/zfs_ioctl.c b/zfs/lib/libdmu-ctl/zfs_ioctl.c
deleted file mode 100644
index e4d253474..000000000
--- a/zfs/lib/libdmu-ctl/zfs_ioctl.c
+++ /dev/null
@@ -1,3055 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_ioctl.c 1.61 08/04/27 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/buf.h>
-#include <sys/modctl.h>
-#include <sys/open.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_znode.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/dmu.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_deleg.h>
-#include <sys/dmu_objset.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/sunldi.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/nvpair.h>
-#include <sys/pathname.h>
-#include <sys/mount.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_dir.h>
-#include <sys/zvol.h>
-#include <sharefs/share.h>
-#include <sys/dmu_objset.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-#include "zfs_deleg.h"
-
-extern struct modlfs zfs_modlfs;
-
-extern void zfs_init(void);
-extern void zfs_fini(void);
-
-ldi_ident_t zfs_li = NULL;
-dev_info_t *zfs_dip;
-
-typedef int zfs_ioc_func_t(zfs_cmd_t *);
-typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *);
-
-typedef struct zfs_ioc_vec {
- zfs_ioc_func_t *zvec_func; /* function taking the zfs_cmd_t */
- zfs_secpolicy_func_t *zvec_secpolicy; /* policy check with cred */
- enum {
- NO_NAME,
- POOL_NAME,
- DATASET_NAME
- } zvec_namecheck; /* presumably how zc_name is validated -- confirm */
- boolean_t zvec_his_log; /* presumably: record in pool history */
-} zfs_ioc_vec_t;
-
-/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
-/*
- * Format a debug message into a 256-byte stack buffer and fire the
- * zfs-dprintf DTrace probe with it. Only the last path component of
- * file is passed to the probe.
- */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
-	const char *slash;
-	const char *newfile;
-	char buf[256];
-	va_list adx;
-
-	/*
-	 * Get rid of annoying "../common/" prefix to filename.
-	 */
-	slash = strrchr(file, '/');
-	newfile = (slash != NULL) ? slash + 1 : file;
-
-	va_start(adx, fmt);
-	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
-	va_end(adx);
-
-	/*
-	 * To get this data, use the zfs-dprintf probe as so:
-	 * dtrace -q -n 'zfs-dprintf \
-	 *	/stringof(arg0) == "dbuf.c"/ \
-	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
-	 * arg0 = file name
-	 * arg1 = function name
-	 * arg2 = line number
-	 * arg3 = message
-	 */
-	DTRACE_PROBE4(zfs__dprintf,
-	    char *, newfile, char *, func, int, line, char *, buf);
-}
-
-/*
- * Free a history buffer obtained from history_str_get(); such buffers
- * are always HIS_MAX_RECORD_LEN bytes long.
- */
-static void
-history_str_free(char *buf)
-{
-	kmem_free(buf, HIS_MAX_RECORD_LEN);
-}
-
-/*
- * Copy the history string referenced by zc->zc_history in from user
- * space.
- *
- * Returns a HIS_MAX_RECORD_LEN byte, NUL-terminated buffer that the
- * caller releases with history_str_free(), or NULL when no history
- * string was supplied or the copy-in failed.
- */
-static char *
-history_str_get(zfs_cmd_t *zc)
-{
-	char *record;
-	int err;
-
-	if (zc->zc_history == NULL)
-		return (NULL);
-
-	record = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
-	err = copyinstr((void *)(uintptr_t)zc->zc_history,
-	    record, HIS_MAX_RECORD_LEN, NULL);
-	if (err != 0) {
-		history_str_free(record);
-		return (NULL);
-	}
-
-	record[HIS_MAX_RECORD_LEN -1] = '\0';
-	return (record);
-}
-
-/*
- * zfs_check_version
- *
- * Return non-zero if the pool named 'name' can be opened and its spa
- * version is less than the requested version. A pool that cannot be
- * opened yields 0.
- */
-static int
-zfs_check_version(const char *name, int version)
-{
-	spa_t *spa;
-	int too_old;
-
-	if (spa_open(name, &spa, FTAG) != 0)
-		return (0);
-
-	too_old = (spa_version(spa) < version) ? 1 : 0;
-	spa_close(spa, FTAG);
-
-	return (too_old);
-}
-
-/*
- * zpl_check_version
- *
- * Return non-zero if the ZPL version is less than requested version.
- * Also returns non-zero when the objset cannot be opened or its version
- * property cannot be read.
- */
-static int
-zpl_check_version(const char *name, int version)
-{
-	objset_t *os;
-	uint64_t propversion;
-	int rc = 1;
-
-	if (dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_STANDARD | DS_MODE_READONLY, &os) != 0)
-		return (rc);
-
-	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &propversion) == 0)
-		rc = (propversion < version);
-
-	dmu_objset_close(os);
-	return (rc);
-}
-
-/*
- * Record the command's history string in the history of the pool named
- * by zc->zc_name, provided a string was supplied, the pool can be
- * opened and its version supports pool history. Failures are ignored.
- */
-static void
-zfs_log_history(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	char *record = history_str_get(zc);
-
-	if (record == NULL)
-		return;
-
-	if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
-		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
-			(void) spa_history_log(spa, record, LOG_CMD_NORMAL);
-		spa_close(spa, FTAG);
-	}
-
-	history_str_free(record);
-}
-
-/*
- * Policy for top-level read operations (such as listing pools).
- * No privilege is required and, because no dataset is involved, the
- * operation is also permitted from a local zone. Always returns 0.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr)
-{
-	return (0);
-}
-
-/*
- * Policy for dataset read operations (list children, get statistics).
- * No privilege is required, but outside the global zone the dataset
- * must be visible to the caller's zone; otherwise ENOENT is returned.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
-{
-	if (!INGLOBALZONE(curproc) &&
-	    !zone_dataset_visible(zc->zc_name, NULL))
-		return (ENOENT);
-
-	return (0);
-}
-
-/*
- * Zone-related access check for a dataset.
- *
- * Returns ENOENT if the dataset is not visible from the caller's zone
- * or its 'zoned' property cannot be read; EPERM if a caller in the
- * global zone fails secpolicy_zfs() on a zoned dataset, or a caller in
- * a local zone addresses a dataset that is not zoned or not writable
- * by that zone; 0 otherwise.
- */
-static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
-{
-	uint64_t zoned;
-	int writable = 1;
-
-	/*
-	 * The dataset must be visible by this zone -- check this first
-	 * so they don't see EPERM on something they shouldn't know about.
-	 */
-	if (!INGLOBALZONE(curproc) &&
-	    !zone_dataset_visible(dataset, &writable))
-		return (ENOENT);
-
-	if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
-		return (ENOENT);
-
-	if (INGLOBALZONE(curproc)) {
-		/*
-		 * If the fs is zoned, only root can access it from the
-		 * global zone.
-		 */
-		return ((secpolicy_zfs(cr) && zoned) ? EPERM : 0);
-	}
-
-	/*
-	 * In a local zone the 'zoned' property must be set and the
-	 * dataset must be writable by this zone.
-	 */
-	if (!zoned || !writable)
-		return (EPERM);
-
-	return (0);
-}
-
-/*
- * Check whether cr may perform the operation named by perm on a
- * dataset. The zone check must pass first; after that either
- * secpolicy_zfs() succeeds or the decision falls to the delegated
- * permission check. Returns 0 when access is granted, otherwise the
- * error from the check that failed.
- */
-int
-zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
-{
-	int error = zfs_dozonecheck(name, cr);
-
-	if (error != 0)
-		return (error);
-
-	if (secpolicy_zfs(cr) == 0)
-		return (0);
-
-	return (dsl_deleg_access(name, perm, cr));
-}
-
-/*
- * Permission check for setting property 'prop' on dataset 'name'.
- *
- * 'zoned' may never be set from within a local zone. From a local zone
- * 'quota' may only be set on a zoned dataset whose name is longer than
- * the point where 'zoned' was set. All properties then go through
- * zfs_secpolicy_write_perms() with the property name as the permission.
- */
-static int
-zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
-{
-	if (prop == ZFS_PROP_ZONED) {
-		/*
-		 * Disallow setting of 'zoned' from within a local zone.
-		 */
-		if (!INGLOBALZONE(curproc))
-			return (EPERM);
-	} else if (prop == ZFS_PROP_QUOTA && !INGLOBALZONE(curproc)) {
-		uint64_t zoned;
-		char setpoint[MAXNAMELEN];
-
-		/*
-		 * Unprivileged users are allowed to modify the
-		 * quota on things *under* (ie. contained by)
-		 * the thing they own.
-		 */
-		if (dsl_prop_get_integer(name, "zoned", &zoned, setpoint))
-			return (EPERM);
-		if (!zoned || strlen(name) <= strlen(setpoint))
-			return (EPERM);
-	}
-
-	return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr));
-}
-
-/*
- * Policy for changing delegated permissions: only the zone check is
- * done here. Permission to set permissions will be evaluated later in
- * dsl_deleg_can_allow().
- */
-int
-zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr)
-{
-	return (zfs_dozonecheck(zc->zc_name, cr));
-}
-
-/*
- * Rollback needs both the 'rollback' and the 'mount' permission on
- * zc_name; the first failure is returned.
- */
-int
-zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
-{
-	int error;
-
-	error = zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_ROLLBACK, cr);
-	if (error != 0)
-		return (error);
-
-	return (zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * Send needs the 'send' permission on zc_name.
- */
-int
-zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
-{
-	const char *dataset = zc->zc_name;
-
-	return (zfs_secpolicy_write_perms(dataset, ZFS_DELEG_PERM_SEND, cr));
-}
-
-/*
- * Policy for sharing.  Never allowed outside the global zone.  A caller
- * that passes secpolicy_nfs() is allowed outright; anyone else must name
- * (in zc_value) a path that lives on a ZFS vfs whose resource string equals
- * zc_name, and must hold the delegated 'share' permission on zc_name.
- */
-int
-zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
-{
-	vnode_t *vp;
-	int error;
-	int on_named_dataset;
-
-	if (!INGLOBALZONE(curproc))
-		return (EPERM);
-
-	if (secpolicy_nfs(cr) == 0)
-		return (0);
-
-	error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
-	if (error != 0)
-		return (error);
-
-	/* Now make sure mntpnt and dataset are ZFS */
-	on_named_dataset = (vp->v_vfsp->vfs_fstype == zfsfstype &&
-	    strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
-	    zc->zc_name) == 0);
-
-	/* The vnode is only needed for the comparison above. */
-	VN_RELE(vp);
-
-	if (!on_named_dataset)
-		return (EPERM);
-
-	return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr));
-}
-
-/*
- * Derive the parent name of 'datasetname' into 'parent' (a buffer of
- * 'parentsize' bytes) by cutting the name at its last '@' or, when there
- * is no '@', at its last '/'.
- *
- * Returns 0 on success, or ENOENT when the name contains neither
- * separator and therefore has no parent.
- */
-static int
-zfs_get_parent(const char *datasetname, char *parent, int parentsize)
-{
-	char *cp;
-
-	/*
-	 * strlcpy() always NUL-terminates.  strncpy() would leave 'parent'
-	 * unterminated for a name of parentsize bytes or more, and the
-	 * strrchr() calls below would then read past the buffer.
-	 */
-	(void) strlcpy(parent, datasetname, parentsize);
-
-	/* Remove the @bla or /bla from the end of the name. */
-	cp = strrchr(parent, '@');
-	if (cp == NULL)
-		cp = strrchr(parent, '/');
-	if (cp == NULL)
-		return (ENOENT);
-
-	cp[0] = '\0';
-	return (0);
-}
-
-/*
- * Destroy needs the 'mount' permission and then the 'destroy' permission
- * on 'name'; the first failure is returned.
- */
-int
-zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
-{
-	int error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr);
-
-	if (error == 0) {
-		error = zfs_secpolicy_write_perms(name,
-		    ZFS_DELEG_PERM_DESTROY, cr);
-	}
-	return (error);
-}
-
-/*
- * zfs_cmd_t flavoured wrapper around zfs_secpolicy_destroy_perms().
- */
-static int
-zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
-{
-	const char *dataset = zc->zc_name;
-
-	return (zfs_secpolicy_destroy_perms(dataset, cr));
-}
-
-/*
- * Policy for checking the iscsi permission: the decision is left entirely
- * to secpolicy_zfs(); 'zc' is not consulted.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr)
-{
-	int error = secpolicy_zfs(cr);
-
-	return (error);
-}
-
-/*
- * Rename needs 'rename' and 'mount' on the source dataset, plus 'create'
- * and 'mount' on the parent of the destination name.  Checks run in that
- * order and the first failure (including a destination with no parent)
- * is returned.
- */
-int
-zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
-{
-	char parentname[MAXNAMELEN];
-	int error;
-
-	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr);
-	if (error != 0)
-		return (error);
-
-	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr);
-	if (error != 0)
-		return (error);
-
-	error = zfs_get_parent(to, parentname, sizeof (parentname));
-	if (error != 0)
-		return (error);
-
-	error = zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_CREATE, cr);
-	if (error != 0)
-		return (error);
-
-	return (zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * zfs_cmd_t flavoured wrapper: zc_name is the source, zc_value the
- * destination handed to zfs_secpolicy_rename_perms().
- */
-static int
-zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr)
-{
-	const char *from = zc->zc_name;
-	const char *to = zc->zc_value;
-
-	return (zfs_secpolicy_rename_perms(from, to, cr));
-}
-
-/*
- * Permission check for promoting the dataset named by zc_name.
- *
- * Requires the 'promote' and 'mount' permissions on zc_name and the
- * 'promote' permission on the dataset opened from its dsl_dir's
- * dd_origin_obj.  Returns 0 or the first errno encountered.
- */
-static int
-zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
-{
- char parentname[MAXNAMELEN];
- objset_t *clone;
- int error;
-
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_PROMOTE, cr);
- if (error)
- return (error);
-
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
-
- if (error == 0) {
- dsl_dataset_t *pclone = NULL;
- dsl_dir_t *dd;
- dd = clone->os->os_dsl_dataset->ds_dir;
-
- /* The origin is opened while holding the pool config lock as reader. */
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- error = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_origin_obj, NULL,
- DS_MODE_NONE, FTAG, &pclone);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- if (error) {
- dmu_objset_close(clone);
- return (error);
- }
-
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr);
-
- /*
- * Capture the origin's name and drop both handles before the
- * result of the mount check is examined, so nothing stays open
- * on either outcome.
- */
- dsl_dataset_name(pclone, parentname);
- dmu_objset_close(clone);
- dsl_dataset_close(pclone, DS_MODE_NONE, FTAG);
- if (error == 0)
- error = zfs_secpolicy_write_perms(parentname,
- ZFS_DELEG_PERM_PROMOTE, cr);
- }
- return (error);
-}
-
-/*
- * Receive needs the 'receive', 'mount' and 'create' permissions on
- * zc_name, checked in that order; the first failure is returned.
- */
-static int
-zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr)
-{
-	const char *dataset = zc->zc_name;
-	int error;
-
-	error = zfs_secpolicy_write_perms(dataset, ZFS_DELEG_PERM_RECEIVE, cr);
-	if (error == 0)
-		error = zfs_secpolicy_write_perms(dataset,
-		    ZFS_DELEG_PERM_MOUNT, cr);
-	if (error == 0)
-		error = zfs_secpolicy_write_perms(dataset,
-		    ZFS_DELEG_PERM_CREATE, cr);
-
-	return (error);
-}
-
-/*
- * Snapshot needs the 'snapshot' permission and then the 'mount' permission
- * on 'name'; the first failure is returned.
- */
-int
-zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
-{
-	int error = zfs_secpolicy_write_perms(name,
-	    ZFS_DELEG_PERM_SNAPSHOT, cr);
-
-	if (error != 0)
-		return (error);
-
-	return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * zfs_cmd_t flavoured wrapper around zfs_secpolicy_snapshot_perms().
- */
-static int
-zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr)
-{
-	const char *dataset = zc->zc_name;
-
-	return (zfs_secpolicy_snapshot_perms(dataset, cr));
-}
-
-/*
- * Permission check for creating zc_name.  The name must have a parent.
- * When zc_value is non-empty the 'clone' permission is required on it.
- * The parent must then grant 'create' and 'mount'.
- */
-static int
-zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr)
-{
-	char parentname[MAXNAMELEN];
-	int error;
-
-	error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname));
-	if (error != 0)
-		return (error);
-
-	if (zc->zc_value[0] != '\0') {
-		error = zfs_secpolicy_write_perms(zc->zc_value,
-		    ZFS_DELEG_PERM_CLONE, cr);
-		if (error != 0)
-			return (error);
-	}
-
-	error = zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_CREATE, cr);
-	if (error != 0)
-		return (error);
-
-	return (zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * Unmount is allowed when secpolicy_fs_unmount() passes, or otherwise when
- * the caller holds the delegated 'mount' permission on zc_name.
- */
-static int
-zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr)
-{
-	if (secpolicy_fs_unmount(cr, NULL) == 0)
-		return (0);
-
-	return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * Policy for pool operations - create/destroy pools, add vdevs, etc.
- * Requires SYS_CONFIG privilege, which is not available in a local zone.
- * Any failure from secpolicy_sys_config() is reported as EPERM.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
-{
-	return (secpolicy_sys_config(cr, B_FALSE) != 0 ? EPERM : 0);
-}
-
-/*
- * Just like zfs_secpolicy_config, except that a caller who fails the
- * sys_config check may still create/remove the minor nodes if it holds
- * the delegated 'mount' permission on the dataset.
- */
-static int
-zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr)
-{
-	if (secpolicy_sys_config(cr, B_FALSE) == 0)
-		return (0);
-
-	return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * Policy for fault injection.  Requires all privileges; the decision is
- * delegated to secpolicy_zinject() and 'zc' is not consulted.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr)
-{
-	int error = secpolicy_zinject(cr);
-
-	return (error);
-}
-
-/*
- * Permission check for inheriting the property named in zc_value.
- *
- * A name that is not a native property must be a user property (else
- * EINVAL) and needs the 'userprop' permission.  A native property must be
- * inheritable (else EINVAL) and is then checked like a property set.
- */
-static int
-zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
-{
-	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
-
-	if (prop != ZPROP_INVAL) {
-		if (!zfs_prop_inheritable(prop))
-			return (EINVAL);
-		return (zfs_secpolicy_setprop(zc->zc_name, prop, cr));
-	}
-
-	if (!zfs_prop_user(zc->zc_value))
-		return (EINVAL);
-
-	return (zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_USERPROP, cr));
-}
-
-/*
- * Returns the nvlist as specified by the user in the zfs_cmd_t.
- *
- * 'nvl' is the user address of a packed nvlist of 'size' bytes.  The
- * buffer is copied in and unpacked; on success the new list is stored in
- * *nvp.  Returns EINVAL for a zero size, otherwise 0 or the error from
- * xcopyin()/nvlist_unpack().  *nvp is left untouched on failure.
- */
-static int
-get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
-{
-	nvlist_t *list = NULL;
-	char *packed;
-	int error;
-
-	if (size == 0)
-		return (EINVAL);
-
-	/* Read in and unpack the user-supplied nvlist. */
-	packed = kmem_alloc(size, KM_SLEEP);
-
-	error = xcopyin((void *)(uintptr_t)nvl, packed, size);
-	if (error == 0)
-		error = nvlist_unpack(packed, size, &list, 0);
-
-	/* The packed copy is scratch space whatever the outcome. */
-	kmem_free(packed, size);
-
-	if (error != 0)
-		return (error);
-
-	*nvp = list;
-	return (0);
-}
-
-/*
- * Pack 'nvl' (native encoding) and copy it out to the user buffer at
- * zc_nvlist_dst.
- *
- * zc_nvlist_dst_size is always overwritten with the packed size.  When the
- * user buffer is too small nothing is copied and ENOMEM is returned, so
- * the updated size reports how much room is required.  Otherwise the
- * result of xcopyout() is returned.
- */
-static int
-put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
-{
-	char *packed = NULL;
-	size_t size;
-	int error;
-
-	VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
-
-	if (size > zc->zc_nvlist_dst_size) {
-		zc->zc_nvlist_dst_size = size;
-		return (ENOMEM);
-	}
-
-	packed = kmem_alloc(size, KM_SLEEP);
-	VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
-	    KM_SLEEP) == 0);
-	error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size);
-	kmem_free(packed, size);
-
-	zc->zc_nvlist_dst_size = size;
-	return (error);
-}
-
-/*
- * Create the pool zc_name from the config nvlist in zc_nvlist_conf and,
- * when zc_nvlist_src_size is non-zero, the property nvlist in
- * zc_nvlist_src.  The history string obtained from 'zc' is handed to
- * spa_create().  Returns 0 or an errno value.
- */
-static int
-zfs_ioc_pool_create(zfs_cmd_t *zc)
-{
-	nvlist_t *config;
-	nvlist_t *props = NULL;
-	char *buf;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    &config);
-	if (error != 0)
-		return (error);
-
-	if (zc->zc_nvlist_src_size != 0) {
-		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-		    &props);
-		if (error != 0) {
-			nvlist_free(config);
-			return (error);
-		}
-	}
-
-	buf = history_str_get(zc);
-	error = spa_create(zc->zc_name, config, props, buf);
-	if (buf != NULL)
-		history_str_free(buf);
-
-	nvlist_free(config);
-	if (props != NULL)
-		nvlist_free(props);
-
-	return (error);
-}
-
-/*
- * Destroy the pool zc_name.  The command is logged to the history before
- * the destroy is attempted.
- */
-static int
-zfs_ioc_pool_destroy(zfs_cmd_t *zc)
-{
-	zfs_log_history(zc);
-
-	return (spa_destroy(zc->zc_name));
-}
-
-/*
- * Import the pool described by the config nvlist in zc_nvlist_conf under
- * the name zc_name, with optional properties from zc_nvlist_src.
- *
- * The config must carry a pool guid equal to zc_guid, otherwise EINVAL is
- * returned without calling spa_import().
- */
-static int
-zfs_ioc_pool_import(zfs_cmd_t *zc)
-{
-	nvlist_t *config;
-	nvlist_t *props = NULL;
-	uint64_t guid;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    &config);
-	if (error != 0)
-		return (error);
-
-	if (zc->zc_nvlist_src_size != 0) {
-		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-		    &props);
-		if (error != 0) {
-			nvlist_free(config);
-			return (error);
-		}
-	}
-
-	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
-	    guid == zc->zc_guid)
-		error = spa_import(zc->zc_name, config, props);
-	else
-		error = EINVAL;
-
-	nvlist_free(config);
-	if (props != NULL)
-		nvlist_free(props);
-
-	return (error);
-}
-
-/*
- * Export the pool zc_name.  The command is logged to the history before
- * the export is attempted.
- */
-static int
-zfs_ioc_pool_export(zfs_cmd_t *zc)
-{
-	zfs_log_history(zc);
-
-	return (spa_export(zc->zc_name, NULL));
-}
-
-/*
- * Copy out the nvlist returned by spa_all_configs(), which is passed
- * &zc_cookie.  Returns EEXIST when spa_all_configs() returns NULL.
- */
-static int
-zfs_ioc_pool_configs(zfs_cmd_t *zc)
-{
-	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
-	int error;
-
-	if (configs == NULL)
-		return (EEXIST);
-
-	error = put_nvlist(zc, configs);
-	nvlist_free(configs);
-
-	return (error);
-}
-
-/*
- * Copy out the stats/config nvlist for pool zc_name.
- *
- * spa_get_stats() may hand back a config even when it reports an error.
- * In that case the copy-out result is returned and the real errno is
- * preserved in zc_cookie.  With no config at all the spa_get_stats()
- * error itself is returned.
- */
-static int
-zfs_ioc_pool_stats(zfs_cmd_t *zc)
-{
-	nvlist_t *config;
-	int error;
-	int ret = 0;
-
-	error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
-	    sizeof (zc->zc_value));
-
-	if (config == NULL)
-		return (error);
-
-	ret = put_nvlist(zc, config);
-	nvlist_free(config);
-
-	/* Report the spa_get_stats() outcome out of band. */
-	zc->zc_cookie = error;
-
-	return (ret);
-}
-
-/*
- * Try to import the given pool, returning pool stats as appropriate so that
- * user land knows which devices are available and overall pool health.
- *
- * The candidate config comes from zc_nvlist_conf; the nvlist produced by
- * spa_tryimport() is copied out.  EINVAL is returned when spa_tryimport()
- * yields nothing.
- */
-static int
-zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
-{
-	nvlist_t *tryconfig;
-	nvlist_t *config;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    &tryconfig);
-	if (error != 0)
-		return (error);
-
-	config = spa_tryimport(tryconfig);
-	nvlist_free(tryconfig);
-	if (config == NULL)
-		return (EINVAL);
-
-	error = put_nvlist(zc, config);
-	nvlist_free(config);
-
-	return (error);
-}
-
-/*
- * Call spa_scrub() on pool zc_name with zc_cookie as its second argument.
- * The call is made with spa_namespace_lock held.
- */
-static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error != 0)
-		return (error);
-
-	mutex_enter(&spa_namespace_lock);
-	error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
-	mutex_exit(&spa_namespace_lock);
-
-	spa_close(spa, FTAG);
-	return (error);
-}
-
-/*
- * Open pool zc_name and call spa_freeze() on it.  Returns the spa_open()
- * result.
- */
-static int
-zfs_ioc_pool_freeze(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error != 0)
-		return (error);
-
-	spa_freeze(spa);
-	spa_close(spa, FTAG);
-	return (0);
-}
-
-/*
- * Upgrade pool zc_name to the version in zc_cookie.  The requested version
- * must lie between the pool's current version and SPA_VERSION inclusive,
- * otherwise EINVAL is returned.
- */
-static int
-zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error != 0)
-		return (error);
-
-	if (zc->zc_cookie >= spa_version(spa) &&
-	    zc->zc_cookie <= SPA_VERSION) {
-		spa_upgrade(spa, zc->zc_cookie);
-	} else {
-		error = EINVAL;
-	}
-
-	spa_close(spa, FTAG);
-	return (error);
-}
-
-/*
- * Copy a chunk of the history of pool zc_name out to the user buffer at
- * zc_history.
- *
- * zc_history_len gives the buffer size on entry (EINVAL if zero);
- * spa_history_get() is handed pointers to zc_history_offset and
- * zc_history_len and may update both.  ENOTSUP is returned for pools older
- * than SPA_VERSION_ZPOOL_HISTORY.
- */
-static int
-zfs_ioc_pool_get_history(zfs_cmd_t *zc)
-{
-	uint64_t size = zc->zc_history_len;
-	char *hist_buf;
-	spa_t *spa;
-	int error;
-
-	if (size == 0)
-		return (EINVAL);
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error != 0)
-		return (error);
-
-	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
-		spa_close(spa, FTAG);
-		return (ENOTSUP);
-	}
-
-	/*
-	 * 'size' keeps the allocation length; zc_history_len may be
-	 * changed by spa_history_get() before the buffer is freed.
-	 */
-	hist_buf = kmem_alloc(size, KM_SLEEP);
-	error = spa_history_get(spa, &zc->zc_history_offset,
-	    &zc->zc_history_len, hist_buf);
-	if (error == 0) {
-		error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
-		    zc->zc_history_len);
-	}
-
-	spa_close(spa, FTAG);
-	kmem_free(hist_buf, size);
-	return (error);
-}
-
-/*
- * Thin wrapper: passes zc_name, zc_obj and zc_value to
- * dsl_dsobj_to_dsname() and returns its result.
- */
-static int
-zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
-{
-	return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
-}
-
-/*
- * Open the ZFS objset zc_name read-only and ask zfs_obj_to_path() to fill
- * zc_value for object zc_obj.
- */
-static int
-zfs_ioc_obj_to_path(zfs_cmd_t *zc)
-{
-	objset_t *osp;
-	int error;
-
-	error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
-	    DS_MODE_NONE | DS_MODE_READONLY, &osp);
-	if (error != 0)
-		return (error);
-
-	error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
-	    sizeof (zc->zc_value));
-
-	dmu_objset_close(osp);
-	return (error);
-}
-
-/*
- * Add the vdevs described by the config nvlist in zc_nvlist_conf to pool
- * zc_name.
- *
- * Returns the spa_open()/get_nvlist() error if either fails, EDOM when the
- * pool has a bootfs and the config adds anything other than l2cache or
- * spare devices, otherwise the result of spa_vdev_add().
- */
-static int
-zfs_ioc_vdev_add(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error;
-	nvlist_t *config, **l2cache, **spares;
-	uint_t nl2cache = 0, nspares = 0;
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error != 0)
-		return (error);
-
-	/*
-	 * Bail out before touching 'config': when get_nvlist() fails it
-	 * does not store a list, so the lookups below would otherwise
-	 * operate on an uninitialized pointer.
-	 */
-	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    &config);
-	if (error != 0) {
-		spa_close(spa, FTAG);
-		return (error);
-	}
-
-	/* Lookup failures simply leave the counts at zero. */
-	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
-	    &l2cache, &nl2cache);
-
-	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
-	    &spares, &nspares);
-
-	/*
-	 * A root pool with concatenated devices is not supported.
-	 * Thus, can not add a device to a root pool.
-	 *
-	 * Intent log device can not be added to a rootpool because
-	 * during mountroot, zil is replayed, a separated log device
-	 * can not be accessed during the mountroot time.
-	 *
-	 * l2cache and spare devices are ok to be added to a rootpool.
-	 */
-	if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
-		/* The unpacked config must not be leaked on this path. */
-		nvlist_free(config);
-		spa_close(spa, FTAG);
-		return (EDOM);
-	}
-
-	error = spa_vdev_add(spa, config);
-	nvlist_free(config);
-	spa_close(spa, FTAG);
-	return (error);
-}
-
-/*
- * Remove the vdev with guid zc_guid from pool zc_name via
- * spa_vdev_remove().
- */
-static int
-zfs_ioc_vdev_remove(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error == 0) {
-		error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
-		spa_close(spa, FTAG);
-	}
-	return (error);
-}
-
-/*
- * Change the state of vdev zc_guid in pool zc_name.
- *
- * zc_cookie selects the requested state (online, offline, faulted or
- * degraded; anything else is EINVAL).  On return zc_cookie is overwritten
- * with 'newstate', which only vdev_online() is given a chance to update
- * and which is VDEV_STATE_UNKNOWN otherwise.
- */
-static int
-zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
-{
-	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
-	spa_t *spa;
-	int error;
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error != 0)
-		return (error);
-
-	switch (zc->zc_cookie) {
-	case VDEV_STATE_ONLINE:
-		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
-		break;
-	case VDEV_STATE_OFFLINE:
-		error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
-		break;
-	case VDEV_STATE_FAULTED:
-		error = vdev_fault(spa, zc->zc_guid);
-		break;
-	case VDEV_STATE_DEGRADED:
-		error = vdev_degrade(spa, zc->zc_guid);
-		break;
-	default:
-		error = EINVAL;
-		break;
-	}
-
-	zc->zc_cookie = newstate;
-	spa_close(spa, FTAG);
-	return (error);
-}
-
-/*
- * Attach the device described by the config nvlist in zc_nvlist_conf to
- * vdev zc_guid of pool zc_name.  zc_cookie is passed through as the
- * 'replacing' flag of spa_vdev_attach().
- */
-static int
-zfs_ioc_vdev_attach(zfs_cmd_t *zc)
-{
-	int replacing = zc->zc_cookie;
-	nvlist_t *config;
-	spa_t *spa;
-	int error;
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error != 0)
-		return (error);
-
-	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    &config);
-	if (error == 0) {
-		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
-		nvlist_free(config);
-	}
-
-	spa_close(spa, FTAG);
-	return (error);
-}
-
-/*
- * Detach vdev zc_guid from pool zc_name via spa_vdev_detach().
- */
-static int
-zfs_ioc_vdev_detach(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error == 0) {
-		error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
-		spa_close(spa, FTAG);
-	}
-	return (error);
-}
-
-/*
- * Set the path of vdev zc_guid in pool zc_name to the string in zc_value.
- */
-static int
-zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error == 0) {
-		error = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
-		spa_close(spa, FTAG);
-	}
-	return (error);
-}
-
-/*
- * Open objset 'name' (any type, standard mode, read-only) into *os,
- * retrying for as long as dmu_objset_open() reports EBUSY.
- *
- * This is ugly: dmu_objset_open() can return EBUSY if the objset is held
- * exclusively.  Fortunately this hold is only for a short while, so we
- * retry here.  This avoids user code having to handle EBUSY, for example
- * for a "zfs list".
- */
-static int
-zfs_os_open_retry(char *name, objset_t **os)
-{
-	int error;
-
-	for (;;) {
-		error = dmu_objset_open(name, DMU_OST_ANY,
-		    DS_MODE_STANDARD | DS_MODE_READONLY, os);
-		if (error != EBUSY)
-			break;
-		delay(1);
-	}
-
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_nvlist_dst_size	size of buffer for property nvlist
- *
- * outputs:
- * zc_objset_stats	stats
- * zc_nvlist_dst	property nvlist
- * zc_nvlist_dst_size	size of property nvlist
- * zc_value		alternate root
- *
- * The property nvlist is only produced when zc_nvlist_dst is non-zero.
- */
-static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
-{
-	objset_t *os = NULL;
-	nvlist_t *props;
-	int error;
-
-	error = zfs_os_open_retry(zc->zc_name, &os);
-	if (error != 0)
-		return (error);
-
-	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
-	if (zc->zc_nvlist_dst != 0) {
-		error = dsl_prop_get_all(os, &props);
-		if (error == 0) {
-			dmu_objset_stats(os, props);
-			/*
-			 * NB: zvol_get_stats() will read the objset
-			 * contents, which we aren't supposed to do with a
-			 * DS_MODE_STANDARD open, because it could be
-			 * inconsistent.  So this is a bit of a workaround...
-			 */
-			if (!zc->zc_objset_stats.dds_inconsistent &&
-			    dmu_objset_type(os) == DMU_OST_ZVOL)
-				VERIFY(zvol_get_stats(os, props) == 0);
-
-			error = put_nvlist(zc, props);
-			nvlist_free(props);
-		}
-	}
-
-	spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
-
-	dmu_objset_close(os);
-	return (error);
-}
-
-/*
- * Look up ZPL property 'prop' on 'os' and add it to 'props' as a uint64
- * keyed by the property's name.  Returns the zfs_get_zplprop() error, if
- * any, without touching 'props'.
- */
-static int
-nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
-{
-	uint64_t value;
-	int error;
-
-	/*
-	 * zfs_get_zplprop() will either find a value or give us
-	 * the default value (if there is one).
-	 */
-	error = zfs_get_zplprop(os, prop, &value);
-	if (error == 0) {
-		VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop),
-		    value) == 0);
-	}
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_nvlist_dst_size	size of buffer for zpl property nvlist
- *
- * outputs:
- * zc_nvlist_dst	zpl property nvlist
- * zc_nvlist_dst_size	size of zpl property nvlist
- *
- * Returns ENOENT when no destination buffer was supplied, when the objset
- * is flagged inconsistent, or when it is not a DMU_OST_ZFS objset.
- */
-static int
-zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
-{
-	objset_t *os;
-	int err;
-
-	if ((err = zfs_os_open_retry(zc->zc_name, &os)) != 0)
-		return (err);
-
-	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
-	/*
-	 * NB: nvl_add_zplprop() will read the objset contents,
-	 * which we aren't supposed to do with a DS_MODE_STANDARD
-	 * open, because it could be inconsistent.
-	 *
-	 * zc_nvlist_dst is an integer-typed user address, so it is
-	 * compared against 0 (as zfs_ioc_objset_stats() does), not NULL.
-	 */
-	if (zc->zc_nvlist_dst != 0 &&
-	    !zc->zc_objset_stats.dds_inconsistent &&
-	    dmu_objset_type(os) == DMU_OST_ZFS) {
-		nvlist_t *nv;
-
-		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
-		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
-		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
-		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
-			err = put_nvlist(zc, nv);
-		nvlist_free(nv);
-	} else {
-		err = ENOENT;
-	}
-	dmu_objset_close(os);
-	return (err);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_cookie zap cursor
- * zc_nvlist_dst_size size of buffer for property nvlist
- *
- * outputs:
- * zc_name name of next filesystem
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- * zc_value alternate root
- *
- * Returns ESRCH (instead of ENOENT) when zc_name cannot be opened or when
- * the listing is exhausted.
- */
-static int
-zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
- char *p;
-
- if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) {
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- /*
- * Make sure zc_name ends in '/', then point p at the end of it so
- * the child's name is written directly after the slash.
- */
- p = strrchr(zc->zc_name, '/');
- if (p == NULL || p[1] != '\0')
- (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
- p = zc->zc_name + strlen(zc->zc_name);
-
- /*
- * Keep advancing the cursor while, outside the global zone, the
- * entry just produced is not visible to this zone.
- */
- do {
- error = dmu_dir_list_next(os,
- sizeof (zc->zc_name) - (p - zc->zc_name), p,
- NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = ESRCH;
- } while (error == 0 && !INGLOBALZONE(curproc) &&
- !zone_dataset_visible(zc->zc_name, NULL));
-
- /*
- * If it's a hidden dataset (ie. with a '$' in its name), don't
- * try to get stats for it. Userland will skip over it.
- */
- if (error == 0 && strchr(zc->zc_name, '$') == NULL)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- dmu_objset_close(os);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_cookie zap cursor
- * zc_nvlist_dst_size size of buffer for property nvlist
- *
- * outputs:
- * zc_name name of next snapshot
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- * zc_value alternate root
- *
- * Returns ESRCH (instead of ENOENT) when zc_name cannot be opened or when
- * the listing is exhausted. On any failure after the '@' has been
- * appended, zc_name is cut back at its first '@'.
- */
-static int
-zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
- if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) {
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- /*
- * A dataset name of maximum length cannot have any snapshots,
- * so exit immediately.
- */
- if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
- dmu_objset_close(os);
- return (ESRCH);
- }
-
- /* The snapshot's own name is written directly after the '@'. */
- error = dmu_snapshot_list_next(os,
- sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
- if (error == ENOENT)
- error = ESRCH;
-
- if (error == 0)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- /* if we failed, undo the @ that we tacked on to zc_name */
- if (error != 0)
- *strchr(zc->zc_name, '@') = '\0';
-
- dmu_objset_close(os);
- return (error);
-}
-
-/*
- * Apply every property in 'nvl' to dataset 'name'.
- *
- * The list is walked twice. The first pass validates: each entry must be
- * either a user property with a string value or a native property, the
- * caller (CRED()) must be permitted to set it, and a few properties are
- * checked against the pool/ZPL version. Nothing is modified during that
- * pass. The second pass sets the properties one by one and returns on the
- * first failure; properties set before the failure are not undone here.
- *
- * Returns 0 on success or an errno value.
- */
-int
-zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
-{
- nvpair_t *elem;
- int error;
- uint64_t intval;
- char *strval;
-
- /*
- * First validate permission to set all of the properties
- */
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
-
- if (prop == ZPROP_INVAL) {
- /*
- * If this is a user-defined property, it must be a
- * string, and there is no further validation to do.
- */
- if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
- return (EINVAL);
-
- if (error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_USERPROP, CRED()))
- return (error);
- continue;
- }
-
- if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
- return (error);
-
- /*
- * Check that this value is valid for this pool version
- */
- switch (prop) {
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0 &&
- intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9) {
- if (zfs_check_version(name,
- SPA_VERSION_GZIP_COMPRESSION))
- return (ENOTSUP);
- }
- break;
-
- case ZFS_PROP_COPIES:
- if (zfs_check_version(name, SPA_VERSION_DITTO_BLOCKS))
- return (ENOTSUP);
- break;
-
- case ZFS_PROP_SHARESMB:
- if (zpl_check_version(name, ZPL_VERSION_FUID))
- return (ENOTSUP);
- break;
- }
- /* NOTE(review): same call as before the switch; looks redundant. */
- if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
- return (error);
- }
-
- /* Second pass: actually set each property. */
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
-
- if (prop == ZPROP_INVAL) {
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- return (error);
- }
-
- /* These properties have dedicated setters; the rest use dsl_prop_set(). */
- switch (prop) {
- case ZFS_PROP_QUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_quota(name, intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_REFQUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_quota(name, intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_RESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_reservation(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_REFRESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_reservation(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volsize(name,
- ddi_driver_major(zfs_dip), intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLBLOCKSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volblocksize(name, intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VERSION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zfs_set_version(name, intval)) != 0)
- return (error);
- break;
-
- default:
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) !=
- PROP_TYPE_STRING)
- return (EINVAL);
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if ((error = dsl_prop_set(name,
- nvpair_name(elem), 1, strlen(strval) + 1,
- strval)) != 0)
- return (error);
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
- const char *unused;
-
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
-
- /* A uint64 value must suit the property's declared type. */
- switch (zfs_prop_get_type(prop)) {
- case PROP_TYPE_NUMBER:
- break;
- case PROP_TYPE_STRING:
- return (EINVAL);
- case PROP_TYPE_INDEX:
- if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0)
- return (EINVAL);
- break;
- default:
- cmn_err(CE_PANIC,
- "unknown property type");
- break;
- }
-
- if ((error = dsl_prop_set(name, propname,
- 8, 1, &intval)) != 0)
- return (error);
- } else {
- return (EINVAL);
- }
- break;
- }
- }
-
- return (0);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_nvlist_src{_size}	nvlist of properties to apply
- *
- * outputs:		none
- */
-static int
-zfs_ioc_set_prop(zfs_cmd_t *zc)
-{
-	nvlist_t *nvl;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &nvl);
-	if (error != 0)
-		return (error);
-
-	error = zfs_set_prop_nvlist(zc->zc_name, nvl);
-	nvlist_free(nvl);
-
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_value		name of property to inherit
- *
- * outputs:		none
- *
- * Inheriting is expressed as a dsl_prop_set() with no value (zero
- * size/count and a NULL buffer).
- */
-static int
-zfs_ioc_inherit_prop(zfs_cmd_t *zc)
-{
-	const char *dataset = zc->zc_name;
-	const char *propname = zc->zc_value;
-
-	/* the property name has been validated by zfs_secpolicy_inherit() */
-	return (dsl_prop_set(dataset, propname, 0, 0, NULL));
-}
-
-/*
- * Apply the property nvlist in zc_nvlist_src to pool zc_name via
- * spa_prop_set().
- */
-static int
-zfs_ioc_pool_set_props(zfs_cmd_t *zc)
-{
-	nvlist_t *props;
-	spa_t *spa;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props);
-	if (error != 0)
-		return (error);
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error == 0) {
-		error = spa_prop_set(spa, props);
-		spa_close(spa, FTAG);
-	}
-
-	nvlist_free(props);
-	return (error);
-}
-
-/*
- * Copy the properties of pool zc_name out to zc_nvlist_dst.
- *
- * Returns the spa_open() error if the pool cannot be opened.  Otherwise
- * returns the put_nvlist() result when spa_prop_get() succeeded and a
- * destination buffer was supplied, and EFAULT in every other case (note
- * that this replaces whatever error spa_prop_get() reported).
- */
-static int
-zfs_ioc_pool_get_props(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	int error;
-	nvlist_t *nvp = NULL;
-
-	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
-		return (error);
-
-	error = spa_prop_get(spa, &nvp);
-
-	/*
-	 * zc_nvlist_dst is an integer-typed user address, so it is
-	 * compared against 0 (as zfs_ioc_objset_stats() does), not NULL.
-	 */
-	if (error == 0 && zc->zc_nvlist_dst != 0)
-		error = put_nvlist(zc, nvp);
-	else
-		error = EFAULT;
-
-	spa_close(spa, FTAG);
-
-	if (nvp)
-		nvlist_free(nvp);
-	return (error);
-}
-
-/*
- * Check whether a user described in the zc_nvlist_src nvlist holds the
- * delegated 'shareiscsi' permission on zc_name.
- *
- * The nvlist must carry a uid, a gid and a group array; a temporary
- * credential is built from them and passed to dsl_deleg_access().  A
- * missing entry, or a failure to populate the credential, yields EPERM.
- */
-static int
-zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
-{
-	nvlist_t *nvp;
-	cred_t *usercred;
-	uint32_t *groups;
-	uint_t group_cnt;
-	uint32_t uid;
-	uint32_t gid;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &nvp);
-	if (error != 0)
-		return (error);
-
-	if (nvlist_lookup_uint32(nvp, ZFS_DELEG_PERM_UID, &uid) != 0 ||
-	    nvlist_lookup_uint32(nvp, ZFS_DELEG_PERM_GID, &gid) != 0 ||
-	    nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS,
-	    &groups, &group_cnt) != 0) {
-		nvlist_free(nvp);
-		return (EPERM);
-	}
-
-	usercred = cralloc();
-	if (crsetugid(usercred, uid, gid) != 0 ||
-	    crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0) {
-		nvlist_free(nvp);
-		crfree(usercred);
-		return (EPERM);
-	}
-
-	/* The nvlist is released only after the credential is populated. */
-	nvlist_free(nvp);
-
-	error = dsl_deleg_access(zc->zc_name,
-	    zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
-	crfree(usercred);
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_nvlist_src{_size}	nvlist of delegated permissions
- * zc_perm_action	allow/unallow flag
- *
- * outputs:		none
- */
-static int
-zfs_ioc_set_fsacl(zfs_cmd_t *zc)
-{
-	nvlist_t *fsaclnv = NULL;
-	int error;
-
-	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    &fsaclnv);
-	if (error != 0)
-		return (error);
-
-	/* Verify nvlist is constructed correctly */
-	if (zfs_deleg_verify_nvlist(fsaclnv) != 0) {
-		nvlist_free(fsaclnv);
-		return (EINVAL);
-	}
-
-	/*
-	 * When secpolicy_zfs() does not approve the caller, validate that
-	 * the user is allowed to hand out (zc_perm_action == B_FALSE) or
-	 * take away (otherwise) each permission in the nvlist(s).
-	 */
-	error = secpolicy_zfs(CRED());
-	if (error != 0) {
-		error = (zc->zc_perm_action == B_FALSE) ?
-		    dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()) :
-		    dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED());
-	}
-
-	if (error == 0)
-		error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
-
-	nvlist_free(fsaclnv);
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of filesystem
- *
- * outputs:
- * zc_nvlist_dst{_size}	nvlist of delegated permissions
- */
-static int
-zfs_ioc_get_fsacl(zfs_cmd_t *zc)
-{
-	nvlist_t *nvp;
-	int error = dsl_deleg_get(zc->zc_name, &nvp);
-
-	if (error != 0)
-		return (error);
-
-	error = put_nvlist(zc, nvp);
-	nvlist_free(nvp);
-	return (error);
-}
-
-/*
- * inputs:
- * zc_name		name of volume
- *
- * outputs:		none
- *
- * The minor node is created under the major number of the zfs driver.
- */
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
-	const char *volname = zc->zc_name;
-
-	return (zvol_create_minor(volname, ddi_driver_major(zfs_dip)));
-}
-
-/*
- * inputs:
- * zc_name		name of volume
- *
- * outputs:		none
- */
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
-	const char *volname = zc->zc_name;
-
-	return (zvol_remove_minor(volname));
-}
-
-/*
- * Search the vfs list for a specified resource. Returns a pointer to it
- * or NULL if no suitable entry is found. The caller of this routine
- * is responsible for releasing the returned vfs pointer.
- *
- * The match is an exact string comparison of 'resource' against each
- * vfs's vfs_resource; the first match wins and is returned with a hold
- * (VFS_HOLD) taken on it.
- */
-static vfs_t *
-zfs_get_vfs(const char *resource)
-{
- struct vfs *vfsp;
- struct vfs *vfs_found = NULL;
-
- /* The walk runs under the vfs list read lock. */
- vfs_list_read_lock();
- vfsp = rootvfs;
- /* Follow vfs_next from rootvfs until the walk returns to rootvfs. */
- do {
- if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
- VFS_HOLD(vfsp);
- vfs_found = vfsp;
- break;
- }
- vfsp = vfsp->vfs_next;
- } while (vfsp != rootvfs);
- vfs_list_unlock();
- return (vfs_found);
-}
-
-/*
- * Creation callback: 'arg' is a zfs_creat_t whose zct_zplprops are passed
- * to zfs_create_fs() for the new objset.
- */
-/* ARGSUSED */
-static void
-zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
-{
-	zfs_creat_t *zct = arg;
-	nvlist_t *zplprops = zct->zct_zplprops;
-
-	zfs_create_fs(os, cr, zplprops, tx);
-}
-
-#define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* sentinel: value not supplied by the creator */
-
-/*
- * inputs:
- * createprops list of properties requested by creator
- * dataset name of dataset we are creating
- * zplver ZPL version to record in zplprops
- *
- * outputs:
- * zplprops values for the zplprops we attach to the master node object
- * is_ci if non-NULL, set to B_TRUE when the resulting case
- *  sensitivity is ZFS_CASE_INSENSITIVE
- *
- * returns:
- * 0 on success, ENOTSUP when zplver predates ZPL_VERSION_NORMALIZATION
- * and the creator requested one of the three properties, or the error
- * from opening the parent dataset.
- *
- * Determine the settings for utf8only, normalization and
- * casesensitivity. Specific values may have been requested by the
- * creator and/or we can inherit values from the parent dataset. If
- * the file system is of too early a vintage, a creator can not
- * request settings for these properties, even if the requested
- * setting is the default value. We don't actually want to create dsl
- * properties for these, so remove them from the source nvlist after
- * processing.
- */
-static int
-zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
- nvlist_t *zplprops, uint64_t zplver, boolean_t *is_ci)
-{
- objset_t *os;
- char parentname[MAXNAMELEN];
- char *cp;
- uint64_t sense = ZFS_PROP_UNDEFINED;
- uint64_t norm = ZFS_PROP_UNDEFINED;
- uint64_t u8 = ZFS_PROP_UNDEFINED;
- int error = 0;
-
- ASSERT(zplprops != NULL);
-
- (void) strlcpy(parentname, dataset, sizeof (parentname));
- cp = strrchr(parentname, '/');
- ASSERT(cp != NULL); /* NOTE(review): only an ASSERT guards a name without '/' */
- cp[0] = '\0'; /* cut at the last '/' to leave the parent's name */
-
- /*
- * Pull out creator prop choices, if any. A failed lookup leaves
- * the variable at ZFS_PROP_UNDEFINED; the entry is removed from
- * createprops either way.
- */
- if (createprops) {
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE));
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_CASE), &sense);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_CASE));
- }
-
- /*
- * If the file system or pool version is too "young" to
- * support normalization and the creator tried to set a value
- * for one of the props, error out. We only need check the
- * ZPL version because we've already checked by now that the
- * SPA version is compatible with the selected ZPL version.
- */
- if (zplver < ZPL_VERSION_NORMALIZATION &&
- (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
- sense != ZFS_PROP_UNDEFINED))
- return (ENOTSUP);
-
- /*
- * Put the version in the zplprops
- */
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
-
- /*
- * Open parent object set so we can inherit zplprop values if
- * necessary.
- */
- if ((error = zfs_os_open_retry(parentname, &os)) != 0)
- return (error);
-
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
-
- /*
- * If we're normalizing, names must always be valid UTF-8 strings.
- * This overrides any utf8only value requested by the creator.
- */
- if (norm)
- u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
-
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
-
- if (is_ci)
- *is_ci = (sense == ZFS_CASE_INSENSITIVE);
-
- dmu_objset_close(os);
- return (0);
-}
-
-/*
- * inputs:
- * zc_objset_type type of objset to create (fs vs zvol)
- * zc_name name of new objset
- * zc_value name of snapshot to clone from (may be empty)
- * zc_nvlist_src{_size} nvlist of properties to apply
- *
- * outputs: none
- *
- * returns:
- * 0 on success. EINVAL for a name containing '@' or '%', an invalid
- * clone origin name, an unsupported objset type on a non-clone create,
- * or missing/invalid zvol size properties. ENOTSUP for an unacceptable
- * ZPL version. Otherwise the error from the failing lower-level call.
- *
- * If the objset is created but its properties cannot be applied, the
- * new objset is destroyed again before the error is returned.
- */
-static int
-zfs_ioc_create(zfs_cmd_t *zc)
-{
- objset_t *clone;
- int error = 0;
- zfs_creat_t zct;
- nvlist_t *nvprops = NULL;
- void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
- dmu_objset_type_t type = zc->zc_objset_type;
-
- switch (type) {
-
- case DMU_OST_ZFS:
- cbfunc = zfs_create_cb;
- break;
-
- case DMU_OST_ZVOL:
- cbfunc = zvol_create_cb;
- break;
-
- default:
- cbfunc = NULL; /* rejected below unless this is a clone */
- break;
- }
- if (strchr(zc->zc_name, '@') ||
- strchr(zc->zc_name, '%'))
- return (EINVAL);
-
- if (zc->zc_nvlist_src != NULL &&
- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
- return (error);
-
- zct.zct_zplprops = NULL;
- zct.zct_props = nvprops;
-
- if (zc->zc_value[0] != '\0') {
- /*
- * We're creating a clone of an existing snapshot.
- */
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
- nvlist_free(nvprops);
- return (EINVAL);
- }
-
- error = dmu_objset_open(zc->zc_value, type,
- DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
- if (error) {
- nvlist_free(nvprops);
- return (error);
- }
-
- error = dmu_objset_create(zc->zc_name, type, clone, 0,
- NULL, NULL);
- if (error) {
- dmu_objset_close(clone);
- nvlist_free(nvprops);
- return (error);
- }
- dmu_objset_close(clone);
- } else {
- boolean_t is_insensitive = B_FALSE;
-
- if (cbfunc == NULL) {
- nvlist_free(nvprops);
- return (EINVAL);
- }
-
- if (type == DMU_OST_ZVOL) {
- uint64_t volsize, volblocksize;
-
- if (nvprops == NULL ||
- nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &volsize) != 0) {
- nvlist_free(nvprops);
- return (EINVAL);
- }
-
- if ((error = nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize)) != 0 && error != ENOENT) {
- nvlist_free(nvprops);
- return (EINVAL);
- }
-
- if (error != 0) /* ENOENT: no block size given, use the default */
- volblocksize = zfs_prop_default_numeric(
- ZFS_PROP_VOLBLOCKSIZE);
-
- if ((error = zvol_check_volblocksize(
- volblocksize)) != 0 ||
- (error = zvol_check_volsize(volsize,
- volblocksize)) != 0) {
- nvlist_free(nvprops);
- return (error);
- }
- } else if (type == DMU_OST_ZFS) {
- uint64_t version;
- int error; /* NOTE(review): shadows the function-level 'error' */
-
- /*
- * Default ZPL version to non-FUID capable if the
- * pool is not upgraded to support FUIDs.
- */
- if (zfs_check_version(zc->zc_name, SPA_VERSION_FUID))
- version = ZPL_VERSION_FUID - 1;
- else
- version = ZPL_VERSION;
-
- /*
- * Potentially override default ZPL version based
- * on creator's request.
- */
- (void) nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VERSION), &version);
-
- /*
- * Make sure version we ended up with is kosher
- */
- if ((version < ZPL_VERSION_INITIAL ||
- version > ZPL_VERSION) ||
- (version >= ZPL_VERSION_FUID &&
- zfs_check_version(zc->zc_name, SPA_VERSION_FUID))) {
- nvlist_free(nvprops);
- return (ENOTSUP);
- }
-
- /*
- * We have to have normalization and
- * case-folding flags correct when we do the
- * file system creation, so go figure them out
- * now.
- */
- VERIFY(nvlist_alloc(&zct.zct_zplprops,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- error = zfs_fill_zplprops(zc->zc_name, nvprops,
- zct.zct_zplprops, version, &is_insensitive);
- if (error != 0) {
- nvlist_free(nvprops);
- nvlist_free(zct.zct_zplprops);
- return (error);
- }
- }
- error = dmu_objset_create(zc->zc_name, type, NULL,
- is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
- nvlist_free(zct.zct_zplprops); /* still NULL on the zvol path */
- }
-
- /*
- * It would be nice to do this atomically. As it stands, the
- * objset already exists when the properties are applied, so a
- * failure here is undone by destroying it.
- */
- if (error == 0) {
- if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
- }
- nvlist_free(nvprops);
- return (error);
-}
-
-/*
- * ioctl handler: validate the short snapshot name and ask the DMU to
- * take the snapshot.
- *
- * inputs:
- * zc_name	name of filesystem
- * zc_value	short name of snapshot
- * zc_cookie	recursive flag
- *
- * outputs:	none
- *
- * Returns EINVAL when zc_value fails snapshot_namecheck(), otherwise the
- * result of dmu_objset_snapshot().
- */
-static int
-zfs_ioc_snapshot(zfs_cmd_t *zc)
-{
-	int error = EINVAL;
-
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) == 0) {
-		error = dmu_objset_snapshot(zc->zc_name,
-		    zc->zc_value, zc->zc_cookie);
-	}
-
-	return (error);
-}
-
-/*
- * Forcibly unmount a snapshot if it is currently mounted.
- *
- * name	dataset name.  When 'arg' is non-NULL it is a filesystem name
- *	and is temporarily extended in place to "name@snapname", so
- *	the buffer must have room for that; it is restored before
- *	returning.  When 'arg' is NULL, 'name' is only acted on if it
- *	already contains '@'.
- * arg	short snapshot name (char *), or NULL.
- *
- * Returns 0 when nothing was mounted or the unmount succeeded, otherwise
- * the error from vn_vfswlock() or dounmount().
- */
-int
-zfs_unmount_snap(char *name, void *arg)
-{
-	char *snapname = arg;
-	vfs_t *vfsp = NULL;
-	int err;
-
-	/*
-	 * Snapshots (which are under .zfs control) must be unmounted
-	 * before they can be destroyed.
-	 */
-	if (snapname != NULL) {
-		char *at;
-
-		(void) strcat(name, "@");
-		(void) strcat(name, snapname);
-		vfsp = zfs_get_vfs(name);
-		/* Put the caller's buffer back to the bare filesystem name. */
-		at = strchr(name, '@');
-		*at = '\0';
-	} else if (strchr(name, '@') != NULL) {
-		vfsp = zfs_get_vfs(name);
-	}
-
-	if (vfsp == NULL)
-		return (0);
-
-	/*
-	 * The hold taken by zfs_get_vfs() is dropped whether or not the
-	 * lock was obtained; on success the unmount is always forced.
-	 */
-	err = vn_vfswlock(vfsp->vfs_vnodecovered);
-	VFS_RELE(vfsp);
-	if (err != 0)
-		return (err);
-
-	return (dounmount(vfsp, MS_FORCE, kcred));
-}
-
-/*
- * ioctl handler: unmount the snapshot named zc_value under zc_name and
- * its child datasets, then destroy those snapshots.
- *
- * inputs:
- * zc_name	name of filesystem
- * zc_value	short name of snapshot
- *
- * outputs:	none
- *
- * Returns EINVAL for an invalid snapshot name, the first error from the
- * unmount pass, or the result of dmu_snapshots_destroy().
- */
-static int
-zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
-{
-	int unmount_err;
-
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-		return (EINVAL);
-
-	unmount_err = dmu_objset_find(zc->zc_name,
-	    zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
-	if (unmount_err != 0)
-		return (unmount_err);
-
-	return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
-}
-
-/*
- * ioctl handler: destroy a dataset.  A ZFS snapshot (name contains '@'
- * and the type is DMU_OST_ZFS) is unmounted first.
- *
- * inputs:
- * zc_name		name of dataset to destroy
- * zc_objset_type	type of objset
- *
- * outputs:	none
- *
- * Returns the unmount error if that step fails, otherwise the result of
- * dmu_objset_destroy().
- */
-static int
-zfs_ioc_destroy(zfs_cmd_t *zc)
-{
-	int is_zfs_snap = (strchr(zc->zc_name, '@') != NULL &&
-	    zc->zc_objset_type == DMU_OST_ZFS);
-
-	if (is_zfs_snap) {
-		int unmount_err = zfs_unmount_snap(zc->zc_name, NULL);
-
-		if (unmount_err != 0)
-			return (unmount_err);
-	}
-
-	return (dmu_objset_destroy(zc->zc_name));
-}
-
-/*
- * inputs:
- * zc_name name of dataset to rollback (to most recent snapshot)
- *
- * outputs: none
- *
- * returns:
- * 0 on success, otherwise the error from opening the objset, from
- * suspending the file system, from the rollback itself, or (when the
- * rollback succeeded) from resuming the file system.
- */
-static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
- zfsvfs_t *zfsvfs = NULL;
-
- /*
- * Get the zfsvfs for the objset being rolled back. There
- * won't be one if we're operating on a zvol or if the
- * file system is not mounted.
- */
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD, &os);
- if (error)
- return (error);
-
- if (dmu_objset_type(os) == DMU_OST_ZFS) {
- mutex_enter(&os->os->os_user_ptr_lock);
- zfsvfs = dmu_objset_get_user(os);
- if (zfsvfs != NULL)
- VFS_HOLD(zfsvfs->z_vfs); /* hold taken under os_user_ptr_lock */
- mutex_exit(&os->os->os_user_ptr_lock);
- }
-
- if (zfsvfs != NULL) {
- char osname[MAXNAMELEN];
- int mode;
-
- error = zfs_suspend_fs(zfsvfs, osname, &mode);
- if (error == 0) {
- int resume_err;
-
- ASSERT(strcmp(osname, zc->zc_name) == 0);
- error = dmu_objset_rollback(os);
- resume_err = zfs_resume_fs(zfsvfs, osname, mode);
- error = error ? error : resume_err; /* rollback error takes precedence */
- } else {
- dmu_objset_close(os); /* no rollback ran, so close it ourselves */
- }
- VFS_RELE(zfsvfs->z_vfs);
- } else {
- error = dmu_objset_rollback(os);
- }
- /* Note, the dmu_objset_rollback() closes the objset for us. */
-
- return (error);
-}
-
-/*
- * ioctl handler: rename a dataset.
- *
- * inputs:
- * zc_name	old name of dataset
- * zc_value	new name of dataset
- * zc_cookie	recursive flag (only valid for snapshots)
- *
- * outputs:	none
- *
- * Returns EINVAL when the new name is not a valid dataset name or
- * contains '%', the unmount error if a mounted snapshot cannot be
- * unmounted, otherwise the result of dmu_objset_rename().
- */
-static int
-zfs_ioc_rename(zfs_cmd_t *zc)
-{
-	boolean_t recursive = zc->zc_cookie & 1;
-	char *newname = zc->zc_value;
-
-	/* The new name comes from userland; force termination first. */
-	newname[sizeof (zc->zc_value) - 1] = '\0';
-	if (dataset_namecheck(newname, NULL, NULL) != 0)
-		return (EINVAL);
-	if (strchr(newname, '%') != NULL)
-		return (EINVAL);
-
-	/*
-	 * A non-recursive rename of a ZFS snapshot unmounts it here; for
-	 * a recursive rename the dataset code works out which snapshots
-	 * need unmounting.
-	 */
-	if (!recursive && strchr(zc->zc_name, '@') != NULL &&
-	    zc->zc_objset_type == DMU_OST_ZFS) {
-		int unmount_err = zfs_unmount_snap(zc->zc_name, NULL);
-
-		if (unmount_err != 0)
-			return (unmount_err);
-	}
-
-	return (dmu_objset_rename(zc->zc_name, newname, recursive));
-}
-
-/*
- * ioctl handler: receive a send stream from a file descriptor into the
- * snapshot named by zc_value.
- *
- * inputs:
- * zc_name		name of containing filesystem
- * zc_nvlist_src{_size}	nvlist of properties to apply
- * zc_value		name of snapshot to create
- * zc_string		name of clone origin (if DRR_FLAG_CLONE)
- * zc_cookie		file descriptor to recv from
- * zc_begin_record	the BEGIN record of the stream (not byteswapped)
- * zc_guid		force flag
- *
- * outputs:
- * zc_cookie		number of bytes read
- *
- * Returns EINVAL when zc_value is not a valid dataset name, lacks '@' or
- * contains '%'; EBADF when zc_cookie is not an open descriptor; EBUSY
- * when the mounted target's z_online_recv_lock is already held;
- * otherwise 0 or the error from the failing lower-level call.
- */
-static int
-zfs_ioc_recv(zfs_cmd_t *zc)
-{
-	file_t *fp;
-	objset_t *os;
-	dmu_recv_cookie_t drc;
-	zfsvfs_t *zfsvfs = NULL;
-	boolean_t force = (boolean_t)zc->zc_guid;
-	int error, fd;
-	offset_t off;
-	nvlist_t *props = NULL;
-	objset_t *origin = NULL;
-	char *tosnap;
-	char tofs[ZFS_MAXNAMELEN];
-
-	/*
-	 * zc_value is copied in from userland and nothing has terminated
-	 * it yet; do so before any string routine walks it, as
-	 * zfs_ioc_create() and zfs_ioc_rename() do.
-	 */
-	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
-	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
-	    strchr(zc->zc_value, '@') == NULL ||
-	    strchr(zc->zc_value, '%'))
-		return (EINVAL);
-
-	/* Split "fs@snap" into tofs and tosnap; the copy is bounded. */
-	(void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
-	tosnap = strchr(tofs, '@');
-	if (tosnap == NULL)
-		return (EINVAL);	/* '@' was lost to truncation */
-	*tosnap = '\0';
-	tosnap++;
-
-	if (zc->zc_nvlist_src != NULL &&
-	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    &props)) != 0)
-		return (error);
-
-	fd = zc->zc_cookie;
-	fp = getf(fd);
-	if (fp == NULL) {
-		nvlist_free(props);
-		return (EBADF);
-	}
-
-	/*
-	 * Get the zfsvfs for the receiving objset.  There
-	 * won't be one if we're operating on a zvol, if the
-	 * objset doesn't exist yet, or is not mounted.
-	 *
-	 * When there is one, it is held and its z_online_recv_lock is
-	 * taken; both are released on every exit path below.
-	 */
-
-	error = dmu_objset_open(tofs, DMU_OST_ZFS,
-	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
-	if (!error) {
-		mutex_enter(&os->os->os_user_ptr_lock);
-		zfsvfs = dmu_objset_get_user(os);
-		if (zfsvfs != NULL) {
-			VFS_HOLD(zfsvfs->z_vfs);
-			mutex_exit(&os->os->os_user_ptr_lock);
-			if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
-				VFS_RELE(zfsvfs->z_vfs);
-				dmu_objset_close(os);
-				nvlist_free(props);
-				releasef(fd);
-				return (EBUSY);
-			}
-		} else {
-			mutex_exit(&os->os->os_user_ptr_lock);
-		}
-		dmu_objset_close(os);
-	}
-
-	if (zc->zc_string[0]) {
-		error = dmu_objset_open(zc->zc_string, DMU_OST_ANY,
-		    DS_MODE_STANDARD | DS_MODE_READONLY, &origin);
-		if (error) {
-			if (zfsvfs != NULL) {
-				mutex_exit(&zfsvfs->z_online_recv_lock);
-				VFS_RELE(zfsvfs->z_vfs);
-			}
-			nvlist_free(props);
-			releasef(fd);
-			return (error);
-		}
-	}
-
-	error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
-	    force, origin, zfsvfs != NULL, &drc);
-	if (origin)
-		dmu_objset_close(origin);
-	if (error) {
-		if (zfsvfs != NULL) {
-			mutex_exit(&zfsvfs->z_online_recv_lock);
-			VFS_RELE(zfsvfs->z_vfs);
-		}
-		nvlist_free(props);
-		releasef(fd);
-		return (error);
-	}
-
-	/*
-	 * If properties are supplied, they are to completely replace
-	 * the existing ones; "inherit" any existing properties.
-	 */
-	if (props) {
-		objset_t *prop_os;	/* distinct name: no shadowing of 'os' */
-		nvlist_t *nv = NULL;
-
-		error = dmu_objset_open(tofs, DMU_OST_ANY,
-		    DS_MODE_STANDARD | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-		    &prop_os);
-		if (error == 0) {
-			error = dsl_prop_get_all(prop_os, &nv);
-			dmu_objset_close(prop_os);
-		}
-		if (error == 0) {
-			nvpair_t *elem;
-			zfs_cmd_t *zc2;
-			zc2 = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
-
-			/* Scratch command used to inherit one property at a time. */
-			(void) strcpy(zc2->zc_name, tofs);
-			for (elem = nvlist_next_nvpair(nv, NULL); elem;
-			    elem = nvlist_next_nvpair(nv, elem)) {
-				(void) strcpy(zc2->zc_value, nvpair_name(elem));
-				if (zfs_secpolicy_inherit(zc2, CRED()) == 0)
-					(void) zfs_ioc_inherit_prop(zc2);
-			}
-			kmem_free(zc2, sizeof (zfs_cmd_t));
-		}
-		if (nv)
-			nvlist_free(nv);
-	}
-
-	/*
-	 * Set properties.  Note, we ignore errors.  Would be better to
-	 * do best-effort in zfs_set_prop_nvlist, too.
-	 */
-	(void) zfs_set_prop_nvlist(tofs, props);
-	nvlist_free(props);
-
-	off = fp->f_offset;
-	error = dmu_recv_stream(&drc, fp->f_vnode, &off);
-
-	if (error == 0) {
-		if (zfsvfs != NULL) {
-			char osname[MAXNAMELEN];
-			int mode;
-
-			/* Mounted target: suspend it around dmu_recv_end(). */
-			error = zfs_suspend_fs(zfsvfs, osname, &mode);
-			if (error == 0) {
-				int resume_err;
-
-				error = dmu_recv_end(&drc);
-				resume_err = zfs_resume_fs(zfsvfs,
-				    osname, mode);
-				error = error ? error : resume_err;
-			} else {
-				dmu_recv_abort_cleanup(&drc);
-			}
-		} else {
-			error = dmu_recv_end(&drc);
-		}
-	}
-	if (zfsvfs != NULL) {
-		mutex_exit(&zfsvfs->z_online_recv_lock);
-		VFS_RELE(zfsvfs->z_vfs);
-	}
-
-	/* Report the bytes consumed and advance the file offset to match. */
-	zc->zc_cookie = off - fp->f_offset;
-	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
-		fp->f_offset = off;
-
-	releasef(fd);
-	return (error);
-}
-
-/*
- * ioctl handler: write a send stream for a snapshot to a file descriptor.
- *
- * inputs:
- * zc_name	name of snapshot to send
- * zc_value	short name of incremental fromsnap (may be empty)
- * zc_cookie	file descriptor to send stream to
- * zc_obj	fromorigin flag (mutually exclusive with zc_value)
- *
- * outputs:	none
- *
- * Returns ENAMETOOLONG when the incremental source name does not fit in
- * a MAXPATHLEN buffer, EBADF when zc_cookie is not an open descriptor,
- * otherwise 0 or the error from opening the snapshots or from
- * dmu_sendbackup().
- */
-static int
-zfs_ioc_send(zfs_cmd_t *zc)
-{
-	objset_t *fromsnap = NULL;
-	objset_t *tosnap;
-	file_t *fp;
-	int error;
-	offset_t off;
-
-	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
-	    DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
-	if (error)
-		return (error);
-
-	if (zc->zc_value[0] != '\0') {
-		char buf[MAXPATHLEN];
-		char *cp;
-		size_t len;
-
-		/*
-		 * Build "<fs>@<fromsnap>" from zc_name up to and including
-		 * its '@' plus the short name in zc_value.  zc_value comes
-		 * from userland unterminated, and the append is bounded by
-		 * the space actually left in buf rather than by its total
-		 * size, so an over-long name is rejected instead of
-		 * overrunning the stack buffer.
-		 */
-		zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
-		len = strlcpy(buf, zc->zc_name, sizeof (buf));
-		if (len >= sizeof (buf)) {
-			dmu_objset_close(tosnap);
-			return (ENAMETOOLONG);
-		}
-		cp = strchr(buf, '@');
-		if (cp != NULL) {
-			*(cp + 1) = '\0';
-			len = (size_t)(cp + 1 - buf);
-		}
-		if (strlcpy(buf + len, zc->zc_value, sizeof (buf) - len) >=
-		    sizeof (buf) - len) {
-			dmu_objset_close(tosnap);
-			return (ENAMETOOLONG);
-		}
-
-		error = dmu_objset_open(buf, DMU_OST_ANY,
-		    DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
-		if (error) {
-			dmu_objset_close(tosnap);
-			return (error);
-		}
-	}
-
-	fp = getf(zc->zc_cookie);
-	if (fp == NULL) {
-		dmu_objset_close(tosnap);
-		if (fromsnap)
-			dmu_objset_close(fromsnap);
-		return (EBADF);
-	}
-
-	off = fp->f_offset;
-	error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off);
-
-	/* Advance the descriptor's offset past whatever was written. */
-	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
-		fp->f_offset = off;
-	releasef(zc->zc_cookie);
-	if (fromsnap)
-		dmu_objset_close(fromsnap);
-	dmu_objset_close(tosnap);
-	return (error);
-}
-
-/*
- * ioctl handler: register the fault described by zc_inject_record via
- * zio_inject_fault().  zc_guid is passed in as an int argument and, on
- * success, is overwritten with the id that zio_inject_fault() returns.
- */
-static int
-zfs_ioc_inject_fault(zfs_cmd_t *zc)
-{
-	int id;
-	int error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
-	    &zc->zc_inject_record);
-
-	if (error != 0)
-		return (error);
-
-	zc->zc_guid = (uint64_t)id;
-	return (0);
-}
-
-/*
- * ioctl handler: pass the id carried in zc_guid to zio_clear_fault() and
- * return its result.
- */
-static int
-zfs_ioc_clear_fault(zfs_cmd_t *zc)
-{
-	int id = (int)zc->zc_guid;
-
-	return (zio_clear_fault(id));
-}
-
-/*
- * ioctl handler: step the injection-list iterator.  zc_guid carries the
- * id in and is always rewritten with the id left by
- * zio_inject_list_next(), whether or not that call succeeded; zc_name
- * and zc_inject_record are filled in by the callee.
- */
-static int
-zfs_ioc_inject_list_next(zfs_cmd_t *zc)
-{
-	int id = (int)zc->zc_guid;
-	int error = zio_inject_list_next(&id, zc->zc_name,
-	    sizeof (zc->zc_name), &zc->zc_inject_record);
-
-	zc->zc_guid = id;
-
-	return (error);
-}
-
-/*
- * ioctl handler: copy the pool's error log to the user buffer at
- * zc_nvlist_dst.
- *
- * On success zc_nvlist_dst_size is set to the count left by
- * spa_get_errlog(); on failure it is set to spa_get_errlog_size().
- * Returns the error from spa_open() or spa_get_errlog().
- */
-static int
-zfs_ioc_error_log(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	size_t count = (size_t)zc->zc_nvlist_dst_size;
-	int error = spa_open(zc->zc_name, &spa, FTAG);
-
-	if (error != 0)
-		return (error);
-
-	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
-	    &count);
-	if (error != 0)
-		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
-	else
-		zc->zc_nvlist_dst_size = count;
-
-	spa_close(spa, FTAG);
-
-	return (error);
-}
-
-/*
- * ioctl handler: clear error state on a pool or on one of its devices.
- *
- * zc_name names the pool.  A zc_guid of 0 passes a NULL vdev to
- * vdev_clear(); otherwise the guid is looked up among the pool's vdevs
- * and then among its l2cache devices.
- *
- * Returns the spa_open() error, EIO when suspended I/O cannot be
- * resumed, ENODEV when the guid matches no device, or 0.
- */
-static int
-zfs_ioc_clear(zfs_cmd_t *zc)
-{
-	spa_t *spa;
-	vdev_t *vd = NULL;
-	uint64_t txg;
-	int error;
-
-	error = spa_open(zc->zc_name, &spa, FTAG);
-	if (error != 0)
-		return (error);
-
-	/*
-	 * Try to resume any I/Os which may have been suspended
-	 * as a result of a complete pool failure.
-	 */
-	if (!list_is_empty(&spa->spa_zio_list) &&
-	    zio_vdev_resume_io(spa) != 0) {
-		spa_close(spa, FTAG);
-		return (EIO);
-	}
-
-	txg = spa_vdev_enter(spa);
-
-	if (zc->zc_guid != 0) {
-		vd = spa_lookup_by_guid(spa, zc->zc_guid);
-		if (vd == NULL) {
-			spa_aux_vdev_t *sav;
-			int i;
-
-			/*
-			 * Check if this is an l2cache device.
-			 */
-			ASSERT(spa != NULL);
-			sav = &spa->spa_l2cache;
-			for (i = 0; i < sav->sav_count; i++) {
-				if (sav->sav_vdevs[i]->vdev_guid ==
-				    zc->zc_guid) {
-					vd = sav->sav_vdevs[i];
-					break;
-				}
-			}
-
-			if (vd == NULL) {
-				(void) spa_vdev_exit(spa, NULL, txg, ENODEV);
-				spa_close(spa, FTAG);
-				return (ENODEV);
-			}
-		}
-	}
-
-	vdev_clear(spa, vd, B_TRUE);
-
-	(void) spa_vdev_exit(spa, NULL, txg, 0);
-
-	spa_close(spa, FTAG);
-
-	return (0);
-}
-
-/*
- * ioctl handler: promote a clone.
- *
- * inputs:
- * zc_name	name of filesystem
- * zc_value	name of origin snapshot
- *
- * outputs:	none
- *
- * zc_value is truncated in place at its '@' so that it names the origin
- * filesystem.  Returns the result of dsl_dataset_promote(); failures of
- * the unmount pass are ignored.
- */
-static int
-zfs_ioc_promote(zfs_cmd_t *zc)
-{
-	char *at = strchr(zc->zc_value, '@');
-
-	/*
-	 * We don't need to unmount *all* the origin fs's snapshots, but
-	 * it's easier.
-	 */
-	if (at != NULL)
-		*at = '\0';
-	(void) dmu_objset_find(zc->zc_value,
-	    zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
-
-	return (dsl_dataset_promote(zc->zc_name));
-}
-
-/*
- * We don't want to have a hard dependency
- * against some special symbols in sharefs,
- * nfs, and smbsrv. Look them up when they are first needed,
- * i.e. when the first file system is shared.
- * None of sharefs, nfs, or smbsrv is an unloadable module.
- *
- * The function pointers and module handles below are filled in by
- * zfs_ioc_share() and zfs_init_sharefs() while holding zfs_share_lock.
- */
-int (*znfsexport_fs)(void *arg); /* "nfs_export" from fs/nfs */
-int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); /* "sharefs_impl" from fs/sharefs */
-int (*zsmbexport_fs)(void *arg, boolean_t add_share); /* "smb_server_share" from drv/smbsrv */
-
-int zfs_nfsshare_inited; /* set to 1 once the NFS symbols are resolved */
-int zfs_smbshare_inited; /* set to 1 once the SMB symbols are resolved */
-
-ddi_modhandle_t nfs_mod;
-ddi_modhandle_t sharefs_mod;
-ddi_modhandle_t smbsrv_mod;
-kmutex_t zfs_share_lock; /* initialized in _init(), destroyed in _fini() */
-
-/*
- * Open the sharefs module and resolve "sharefs_impl" into zshare_fs if
- * that has not been done yet.  The caller must hold zfs_share_lock.
- *
- * Returns 0 on success, ENOSYS if the module or the symbol is missing.
- */
-static int
-zfs_init_sharefs()
-{
-	int error;
-
-	ASSERT(MUTEX_HELD(&zfs_share_lock));
-
-	/* Both NFS and SMB shares also require sharetab support. */
-	if (sharefs_mod == NULL) {
-		sharefs_mod = ddi_modopen("fs/sharefs",
-		    KRTLD_MODE_FIRST, &error);
-		if (sharefs_mod == NULL)
-			return (ENOSYS);
-	}
-
-	if (zshare_fs == NULL) {
-		zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
-		    ddi_modsym(sharefs_mod, "sharefs_impl", &error);
-		if (zshare_fs == NULL)
-			return (ENOSYS);
-	}
-
-	return (0);
-}
-
-/*
- * ioctl handler: share or unshare a file system over NFS or SMB.
- *
- * The protocol's module and export symbol, plus sharefs, are resolved
- * under zfs_share_lock the first time that protocol is used.  The export
- * routine is then called with zc_share.z_exportdata, and finally the
- * sharetab is updated through zshare_fs().
- *
- * Returns EINVAL for an unknown z_sharetype, ENOSYS when a required
- * module or symbol cannot be loaded, the export routine's error if it
- * fails, otherwise the result of zshare_fs().
- */
-static int
-zfs_ioc_share(zfs_cmd_t *zc)
-{
-	int error;
-	int opcode;
-
-	switch (zc->zc_share.z_sharetype) {
-	case ZFS_SHARE_NFS:
-	case ZFS_UNSHARE_NFS:
-		if (zfs_nfsshare_inited == 0) {
-			mutex_enter(&zfs_share_lock);
-			if (nfs_mod == NULL) {
-				nfs_mod = ddi_modopen("fs/nfs",
-				    KRTLD_MODE_FIRST, &error);
-				if (nfs_mod == NULL) {
-					mutex_exit(&zfs_share_lock);
-					return (ENOSYS);
-				}
-			}
-			if (znfsexport_fs == NULL) {
-				znfsexport_fs = (int (*)(void *))
-				    ddi_modsym(nfs_mod, "nfs_export", &error);
-				if (znfsexport_fs == NULL) {
-					mutex_exit(&zfs_share_lock);
-					return (ENOSYS);
-				}
-			}
-			error = zfs_init_sharefs();
-			if (error != 0) {
-				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
-			}
-			zfs_nfsshare_inited = 1;
-			mutex_exit(&zfs_share_lock);
-		}
-
-		error = znfsexport_fs((void *)
-		    (uintptr_t)zc->zc_share.z_exportdata);
-		if (error != 0)
-			return (error);
-		break;
-
-	case ZFS_SHARE_SMB:
-	case ZFS_UNSHARE_SMB:
-		if (zfs_smbshare_inited == 0) {
-			mutex_enter(&zfs_share_lock);
-			if (smbsrv_mod == NULL) {
-				smbsrv_mod = ddi_modopen("drv/smbsrv",
-				    KRTLD_MODE_FIRST, &error);
-				if (smbsrv_mod == NULL) {
-					mutex_exit(&zfs_share_lock);
-					return (ENOSYS);
-				}
-			}
-			if (zsmbexport_fs == NULL) {
-				zsmbexport_fs = (int (*)(void *, boolean_t))
-				    ddi_modsym(smbsrv_mod,
-				    "smb_server_share", &error);
-				if (zsmbexport_fs == NULL) {
-					mutex_exit(&zfs_share_lock);
-					return (ENOSYS);
-				}
-			}
-			error = zfs_init_sharefs();
-			if (error != 0) {
-				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
-			}
-			zfs_smbshare_inited = 1;
-			mutex_exit(&zfs_share_lock);
-		}
-
-		/* The second argument tells smbsrv whether to add the share. */
-		error = zsmbexport_fs((void *)
-		    (uintptr_t)zc->zc_share.z_exportdata,
-		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
-		    B_TRUE : B_FALSE);
-		if (error != 0)
-			return (error);
-		break;
-
-	default:
-		return (EINVAL);
-	}
-
-	if (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
-	    zc->zc_share.z_sharetype == ZFS_SHARE_SMB)
-		opcode = SHAREFS_ADD;
-	else
-		opcode = SHAREFS_REMOVE;
-
-	/*
-	 * Add or remove share from sharetab
-	 */
-	error = zshare_fs(opcode,
-	    (void *)(uintptr_t)zc->zc_share.z_sharedata,
-	    zc->zc_share.z_sharemax);
-
-	return (error);
-}
-
-/*
- * ioctl dispatch table, indexed by (cmd - ZFS_IOC) in zfsdev_ioctl().
- * Each entry gives, in order: the handler, the security policy check
- * run before it, the kind of name validation applied to zc_name
- * (POOL_NAME, DATASET_NAME or NO_NAME), and whether a successful call
- * is recorded with zfs_log_history().
- *
- * pool create, destroy, and export don't log the history as part of
- * zfsdev_ioctl; instead zfs_ioc_pool_create and zfs_ioc_pool_export
- * do the logging of those commands themselves.
- */
-static zfs_ioc_vec_t zfs_ioc_vec[] = {
- { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE },
- { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_dataset_list_next, zfs_secpolicy_read,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE },
- { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
- { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
- { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE },
- { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE },
- { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE },
- { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE },
- { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE },
- { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE },
- { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE },
- { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE },
- { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE },
- { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE },
- { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE },
- { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE },
-};
-
-/*
- * ioctl entry point for the zfs driver.
- *
- * A non-zero minor is a zvol and is passed to zvol_ioctl().  For the
- * control node (minor 0), cmd selects an entry of zfs_ioc_vec[].  The
- * user's zfs_cmd_t is copied in, then the entry's security policy, the
- * name check on zc_name and the handler run in that order, each only if
- * everything before it succeeded.  The zfs_cmd_t is always copied back
- * out.
- *
- * Returns EINVAL for an out-of-range cmd or an invalid name, otherwise
- * the first error hit; if all steps succeed, the copy-out result.
- */
-static int
-zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
-{
-	zfs_ioc_vec_t *vecp;
-	zfs_cmd_t *zc;
-	uint_t vec;
-	int error, rc;
-
-	if (getminor(dev) != 0)
-		return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
-
-	vec = cmd - ZFS_IOC;
-	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
-
-	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
-		return (EINVAL);
-	vecp = &zfs_ioc_vec[vec];
-
-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
-
-	error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t));
-
-	if (error == 0)
-		error = vecp->zvec_secpolicy(zc, cr);
-
-	/*
-	 * Ensure that all pool/dataset names are valid before we pass down to
-	 * the lower layers.
-	 */
-	if (error == 0) {
-		zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
-		if (vecp->zvec_namecheck == POOL_NAME) {
-			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
-				error = EINVAL;
-		} else if (vecp->zvec_namecheck == DATASET_NAME) {
-			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
-				error = EINVAL;
-		}
-		/* NO_NAME: nothing to validate. */
-	}
-
-	if (error == 0)
-		error = vecp->zvec_func(zc);
-
-	/*
-	 * Copy the command back out unconditionally.  The copy-out result
-	 * only becomes the return value when nothing failed earlier, and
-	 * history is logged whenever the handler itself succeeded.
-	 */
-	rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t));
-	if (error == 0) {
-		error = rc;
-		if (vecp->zvec_his_log == B_TRUE)
-			zfs_log_history(zc);
-	}
-
-	kmem_free(zc, sizeof (zfs_cmd_t));
-	return (error);
-}
-
-/*
- * attach(9E) entry point.  Only DDI_ATTACH is supported: it creates the
- * "zfs" character minor node (minor 0) and records dip in zfs_dip.
- *
- * Returns DDI_SUCCESS, or DDI_FAILURE for any other command or when the
- * minor node cannot be created.
- */
-static int
-zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
-{
-	int created;
-
-	if (cmd != DDI_ATTACH)
-		return (DDI_FAILURE);
-
-	created = ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
-	    DDI_PSEUDO, 0);
-	if (created == DDI_FAILURE)
-		return (DDI_FAILURE);
-
-	zfs_dip = dip;
-
-	ddi_report_dev(dip);
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * detach(9E) entry point.  Refuses while spa, zfs or zvol report busy,
- * and for any command other than DDI_DETACH.  Otherwise it clears
- * zfs_dip and removes the device's properties and minor nodes.
- *
- * Returns DDI_SUCCESS or DDI_FAILURE.
- */
-static int
-zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
-{
-	int in_use = (spa_busy() || zfs_busy() || zvol_busy());
-
-	if (in_use || cmd != DDI_DETACH)
-		return (DDI_FAILURE);
-
-	zfs_dip = NULL;
-
-	ddi_prop_remove_all(dip);
-	ddi_remove_minor_node(dip, NULL);
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * getinfo(9E) entry point.  DDI_INFO_DEVT2DEVINFO yields zfs_dip and
- * DDI_INFO_DEVT2INSTANCE yields instance 0; any other request fails.
- */
-/*ARGSUSED*/
-static int
-zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
-{
-	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
-		*result = zfs_dip;
-		return (DDI_SUCCESS);
-	}
-
-	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
-		*result = (void *)0;
-		return (DDI_SUCCESS);
-	}
-
-	return (DDI_FAILURE);
-}
-
-/*
- * OK, so this is a little weird.
- *
- * /dev/zfs is the control node, i.e. minor 0.
- * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
- *
- * /dev/zfs has basically nothing to do except serve up ioctls,
- * so most of the standard driver entry points are in zvol.c.
- * The ioctl entry, zfsdev_ioctl(), is the exception: it is the one
- * that tells the control node apart from the zvol minors.
- */
-static struct cb_ops zfs_cb_ops = {
- zvol_open, /* open */
- zvol_close, /* close */
- zvol_strategy, /* strategy */
- nodev, /* print */
- zvol_dump, /* dump */
- zvol_read, /* read */
- zvol_write, /* write */
- zfsdev_ioctl, /* ioctl */
- nodev, /* devmap */
- nodev, /* mmap */
- nodev, /* segmap */
- nochpoll, /* poll */
- ddi_prop_op, /* prop_op */
- NULL, /* streamtab */
- D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
- CB_REV, /* version */
- nodev, /* async read */
- nodev, /* async write */
-};
-
-static struct dev_ops zfs_dev_ops = { /* device operations for the zfs driver */
- DEVO_REV, /* version */
- 0, /* refcnt */
- zfs_info, /* info */
- nulldev, /* identify */
- nulldev, /* probe */
- zfs_attach, /* attach */
- zfs_detach, /* detach */
- nodev, /* reset */
- &zfs_cb_ops, /* driver operations */
- NULL /* no bus operations */
-};
-
-static struct modldrv zfs_modldrv = { /* driver linkage, referenced from modlinkage */
- &mod_driverops, "ZFS storage pool version " SPA_VERSION_STRING,
- &zfs_dev_ops
-};
-
-static struct modlinkage modlinkage = { /* passed to mod_install/mod_remove/mod_info */
- MODREV_1,
- (void *)&zfs_modlfs, /* file system linkage */
- (void *)&zfs_modldrv, /* device driver linkage */
- NULL /* terminator */
-};
-
-
-uint_t zfs_fsyncer_key; /* thread-specific data key; created in _init() */
-extern uint_t rrw_tsd_key; /* defined elsewhere; also created in _init() */
-
-/*
- * Module load entry point.  Initializes the spa, zfs and zvol
- * subsystems and installs the module; if installation fails they are
- * torn down in reverse order and the error is returned.  After a
- * successful install it creates the two thread-specific data keys,
- * obtains the LDI identifier and initializes zfs_share_lock.
- *
- * Returns 0 on success or the mod_install() error.
- */
-int
-_init(void)
-{
-	int error;
-
-	spa_init(FREAD | FWRITE);
-	zfs_init();
-	zvol_init();
-
-	error = mod_install(&modlinkage);
-	if (error != 0) {
-		zvol_fini();
-		zfs_fini();
-		spa_fini();
-		return (error);
-	}
-
-	tsd_create(&zfs_fsyncer_key, NULL);
-	tsd_create(&rrw_tsd_key, NULL);
-
-	/* A failure here is only caught by the ASSERT. */
-	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
-	ASSERT(error == 0);
-	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
-
-	return (0);
-}
-
-/*
- * Module unload entry point.
- *
- * Refuses with EBUSY while spa, zfs or zvol are busy or fault injection
- * is enabled, and returns the mod_remove() error if removal fails.
- * Otherwise it tears down the subsystems, closes whichever share modules
- * were opened, and releases what _init() set up: both thread-specific
- * data keys, the LDI identifier and zfs_share_lock.
- */
-int
-_fini(void)
-{
-	int error;
-
-	if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
-		return (EBUSY);
-
-	if ((error = mod_remove(&modlinkage)) != 0)
-		return (error);
-
-	zvol_fini();
-	zfs_fini();
-	spa_fini();
-	if (zfs_nfsshare_inited)
-		(void) ddi_modclose(nfs_mod);
-	if (zfs_smbshare_inited)
-		(void) ddi_modclose(smbsrv_mod);
-	/* sharefs is opened along with either protocol. */
-	if (zfs_nfsshare_inited || zfs_smbshare_inited)
-		(void) ddi_modclose(sharefs_mod);
-
-	/*
-	 * Destroy both keys created in _init(); rrw_tsd_key was
-	 * previously left behind on unload.
-	 */
-	tsd_destroy(&zfs_fsyncer_key);
-	tsd_destroy(&rrw_tsd_key);
-	ldi_ident_release(zfs_li);
-	zfs_li = NULL;
-	mutex_destroy(&zfs_share_lock);
-
-	return (error);
-}
-
-/*
- * Module information entry point: report on this module's linkage via
- * mod_info().
- */
-int
-_info(struct modinfo *modinfop)
-{
-	int rv = mod_info(&modlinkage, modinfop);
-
-	return (rv);
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_log.c b/zfs/lib/libdmu-ctl/zfs_log.c
deleted file mode 100644
index 364385808..000000000
--- a/zfs/lib/libdmu-ctl/zfs_log.c
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_log.c 1.13 08/04/09 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/thread.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/byteorder.h>
-#include <sys/policy.h>
-#include <sys/stat.h>
-#include <sys/mode.h>
-#include <sys/acl.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/zfs_fuid.h>
-#include <sys/ddi.h>
-
-/*
- * All the functions in this file are used to construct the log entries
- * to record transactions. They allocate an intent log transaction
- * structure (itx_t) and save within it all the information necessary to
- * possibly replay the transaction. The itx is then assigned a sequence
- * number and inserted in the in-memory list anchored in the zilog.
- */
-
-/*
- * Select the create-style transaction type for a new object.
- *
- * For files (Z_FILE) and directories (Z_DIR) the result depends on whether
- * an ACL was supplied (vsecp != NULL) and whether the vattr carries
- * optional attributes (AT_XVATTR set in va_mask):
- *
- *	ACL	xvattr	file			directory
- *	no	no	TX_CREATE		TX_MKDIR
- *	yes	yes	TX_CREATE_ACL_ATTR	TX_MKDIR_ACL_ATTR
- *	yes	no	TX_CREATE_ACL		TX_MKDIR_ACL
- *	no	yes	TX_CREATE_ATTR		TX_MKDIR_ATTR
- *
- * Z_XATTRDIR always yields TX_MKXATTR. Any other type trips an assertion
- * and returns TX_MAX_TYPE.
- */
-int
-zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
-{
-	int has_acl = (vsecp != NULL);
-	int has_xvattr = ((vap->va_mask & AT_XVATTR) != 0);
-
-	switch (type) {
-	case Z_FILE:
-		if (has_acl)
-			return (has_xvattr ? TX_CREATE_ACL_ATTR :
-			    TX_CREATE_ACL);
-		return (has_xvattr ? TX_CREATE_ATTR : TX_CREATE);
-	case Z_DIR:
-		if (has_acl)
-			return (has_xvattr ? TX_MKDIR_ACL_ATTR :
-			    TX_MKDIR_ACL);
-		return (has_xvattr ? TX_MKDIR_ATTR : TX_MKDIR);
-	case Z_XATTRDIR:
-		return (TX_MKXATTR);
-	}
-	ASSERT(0);
-	return (TX_MAX_TYPE);
-}
-
-/*
- * Pack the requested optional attributes of an xvattr_t into a log record.
- *
- * Starting at lrattr the following is written:
- *   - lr_attr_masksize, set to xvap->xva_mapsize
- *   - xva_mapsize 32-bit words of the request bitmap
- *   - one 64-bit word holding an XAT0_* bit for every requested boolean
- *     attribute whose value is non-zero
- *   - two 64-bit words of create time (only if XAT_CREATETIME is requested)
- *   - AV_SCANSTAMP_SZ bytes of scanstamp (only if XAT_AV_SCANSTAMP is
- *     requested)
- *
- * The caller must have sized the record to hold all of this.
- */
-static void
-zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
-{
-	uint32_t *bitmap;
-	uint64_t *attrs;
-	uint64_t *crtime;
-	xoptattr_t *xoap;
-	void *scanstamp;
-	int i;
-
-	xoap = xva_getxoptattr(xvap);
-	ASSERT(xoap);
-
-	lrattr->lr_attr_masksize = xvap->xva_mapsize;
-	bitmap = &lrattr->lr_attr_bitmap;
-	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++)
-		*bitmap = xvap->xva_reqattrmap[i];
-
-	/* The attribute word starts right after the last bitmap word. */
-	attrs = (uint64_t *)bitmap;
-	crtime = attrs + 1;
-	scanstamp = (caddr_t)(crtime + 2);
-
-	/* Now pack the attributes up in a single uint64_t */
-	*attrs = 0;
-	if (XVA_ISSET_REQ(xvap, XAT_READONLY) && xoap->xoa_readonly != 0)
-		*attrs |= XAT0_READONLY;
-	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN) && xoap->xoa_hidden != 0)
-		*attrs |= XAT0_HIDDEN;
-	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM) && xoap->xoa_system != 0)
-		*attrs |= XAT0_SYSTEM;
-	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE) && xoap->xoa_archive != 0)
-		*attrs |= XAT0_ARCHIVE;
-	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && xoap->xoa_immutable != 0)
-		*attrs |= XAT0_IMMUTABLE;
-	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && xoap->xoa_nounlink != 0)
-		*attrs |= XAT0_NOUNLINK;
-	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && xoap->xoa_appendonly != 0)
-		*attrs |= XAT0_APPENDONLY;
-	/*
-	 * The opaque attribute gets its own bit; it used to be recorded as
-	 * XAT0_APPENDONLY, which the replay code reads back as append-only.
-	 */
-	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE) && xoap->xoa_opaque != 0)
-		*attrs |= XAT0_OPAQUE;
-	if (XVA_ISSET_REQ(xvap, XAT_NODUMP) && xoap->xoa_nodump != 0)
-		*attrs |= XAT0_NODUMP;
-	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
-	    xoap->xoa_av_quarantined != 0)
-		*attrs |= XAT0_AV_QUARANTINED;
-	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && xoap->xoa_av_modified != 0)
-		*attrs |= XAT0_AV_MODIFIED;
-
-	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
-		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
-	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
-		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
-}
-
-/*
- * Write the log FUID (z_logfuid) of every entry on fuidp->z_fuids into
- * consecutive 64-bit slots beginning at start. Returns the address just
- * past the last slot written.
- */
-static void *
-zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
-{
-	uint64_t *slot = start;
-	zfs_fuid_t *entry = list_head(&fuidp->z_fuids);
-
-	while (entry != NULL) {
-		*slot = entry->z_logfuid;
-		slot++;
-		entry = list_next(&fuidp->z_fuids, entry);
-	}
-	return (slot);
-}
-
-
-/*
- * Append every domain string on fuidp->z_domains, including its
- * terminating NUL, back to back beginning at start. Nothing is written
- * when z_domain_str_sz is zero. Returns the address just past the last
- * byte written (start itself if nothing was written).
- */
-static void *
-zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
-{
-	zfs_fuid_domain_t *dom;
-	caddr_t cursor = start;
-
-	if (fuidp->z_domain_str_sz == 0)
-		return (start);
-
-	for (dom = list_head(&fuidp->z_domains); dom != NULL;
-	    dom = list_next(&fuidp->z_domains, dom)) {
-		size_t nbytes = strlen(dom->z_domain) + 1;
-
-		bcopy((void *)dom->z_domain, cursor, nbytes);
-		cursor += nbytes;
-	}
-	return (cursor);
-}
-
-/*
- * zfs_log_create() handles the create-style transactions: TX_CREATE,
- * TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and TX_MKXATTR (logged with an
- * lr_create_t header) and the *_ACL / *_ACL_ATTR variants (logged with an
- * lr_acl_create_t header).
- *
- * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
- * domain information appended prior to the name. In this case the
- * uid/gid in the log record will be a log centric FUID.
- *
- * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
- * may contain attributes, ACL and optional fuid information.
- *
- * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
- * an ACL and normal users/groups in the ACEs.
- *
- * The record is laid out as:
- *	header | [xvattr data] | [ACEs] | [FUIDs] | [domain strings] | name
- *
- * Nothing is logged when zilog is NULL. On return the z_last_itx of both
- * dzp and zp holds the sequence number assigned to the record.
- */
-void
-zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
-    zfs_fuid_info_t *fuidp, vattr_t *vap)
-{
-	itx_t *itx;
-	uint64_t seq;
-	lr_create_t *lr;
-	lr_acl_create_t *lracl;
-	size_t aclsize = 0;	/* stays 0 unless an ACL record is built */
-	size_t xvatsize = 0;
-	size_t txsize;
-	xvattr_t *xvap = (xvattr_t *)vap;
-	void *end;
-	size_t lrsize;
-
-	size_t namesize = strlen(name) + 1;
-	size_t fuidsz = 0;
-
-	if (zilog == NULL)
-		return;
-
-	/*
-	 * If we have FUIDs present then add in space for
-	 * domains and ACE fuid's if any.
-	 */
-	if (fuidp) {
-		fuidsz += fuidp->z_domain_str_sz;
-		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
-	}
-
-	if (vap->va_mask & AT_XVATTR)
-		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
-
-	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
-	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
-	    (int)txtype == TX_MKXATTR) {
-		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
-		lrsize = sizeof (*lr);
-	} else {
-		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
-		txsize =
-		    sizeof (lr_acl_create_t) + namesize + fuidsz +
-		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
-		lrsize = sizeof (lr_acl_create_t);
-	}
-
-	itx = zil_itx_create(txtype, txsize);
-
-	lr = (lr_create_t *)&itx->itx_lr;
-	lr->lr_doid = dzp->z_id;
-	lr->lr_foid = zp->z_id;
-	lr->lr_mode = zp->z_phys->zp_mode;
-	/* Ephemeral ids are logged as the log-centric FUID from fuidp. */
-	if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
-		lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
-	} else {
-		lr->lr_uid = fuidp->z_fuid_owner;
-	}
-	if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
-		lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
-	} else {
-		lr->lr_gid = fuidp->z_fuid_group;
-	}
-	lr->lr_gen = zp->z_phys->zp_gen;
-	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
-	lr->lr_rdev = zp->z_phys->zp_rdev;
-
-	/*
-	 * Fill in xvattr info if any
-	 */
-	if (vap->va_mask & AT_XVATTR) {
-		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
-		end = (caddr_t)lr + lrsize + xvatsize;
-	} else {
-		end = (caddr_t)lr + lrsize;
-	}
-
-	/* Now fill in any ACL info */
-
-	if (vsecp) {
-		lracl = (lr_acl_create_t *)&itx->itx_lr;
-		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
-		lracl->lr_acl_bytes = aclsize;
-		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
-		lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
-		/*
-		 * VSA_ACE_ACLFLAGS is a vsa_mask bit saying vsa_aclflags is
-		 * valid, so it must be tested against vsa_mask (as
-		 * zfs_log_acl() does), not against the flags themselves.
-		 */
-		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
-			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
-		else
-			lracl->lr_acl_flags = 0;
-
-		bcopy(vsecp->vsa_aclentp, end, aclsize);
-		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
-	}
-
-	/* drop in FUID info */
-	if (fuidp) {
-		end = zfs_log_fuid_ids(fuidp, end);
-		end = zfs_log_fuid_domains(fuidp, end);
-	}
-	/*
-	 * Now place file name in log record
-	 */
-	bcopy(name, end, namesize);
-
-	seq = zil_itx_assign(zilog, itx, tx);
-	dzp->z_last_itx = seq;
-	zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_remove() logs TX_REMOVE and TX_RMDIR transactions.
- *
- * The record is an lr_remove_t naming the parent directory object,
- * followed by the NUL-terminated entry name. Nothing is logged when
- * zilog is NULL. The assigned sequence number is stored in
- * dzp->z_last_itx.
- */
-void
-zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, char *name)
-{
-	size_t namesize = strlen(name) + 1;
-	lr_remove_t *lr;
-	itx_t *itx;
-
-	if (zilog == NULL)
-		return;
-
-	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
-	lr = (lr_remove_t *)&itx->itx_lr;
-	lr->lr_doid = dzp->z_id;
-
-	/* the name lives immediately after the fixed-size record */
-	bcopy(name, (char *)(lr + 1), namesize);
-
-	dzp->z_last_itx = zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * zfs_log_link() logs TX_LINK transactions.
- *
- * The record is an lr_link_t holding the directory object and the object
- * being linked, followed by the NUL-terminated link name. Nothing is
- * logged when zilog is NULL. The assigned sequence number is stored in
- * the z_last_itx of both dzp and zp.
- */
-void
-zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, znode_t *zp, char *name)
-{
-	size_t namesize = strlen(name) + 1;
-	lr_link_t *lr;
-	itx_t *itx;
-	uint64_t seq;
-
-	if (zilog == NULL)
-		return;
-
-	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
-	lr = (lr_link_t *)&itx->itx_lr;
-	lr->lr_doid = dzp->z_id;
-	lr->lr_link_obj = zp->z_id;
-
-	/* the name lives immediately after the fixed-size record */
-	bcopy(name, (char *)(lr + 1), namesize);
-
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = dzp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_symlink() logs TX_SYMLINK transactions.
- *
- * The record is an lr_create_t filled in from the new znode, followed by
- * the NUL-terminated entry name and then the NUL-terminated link target.
- * Nothing is logged when zilog is NULL. The assigned sequence number is
- * stored in the z_last_itx of both dzp and zp.
- */
-void
-zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, znode_t *zp, char *name, char *link)
-{
-	size_t namesize = strlen(name) + 1;
-	size_t linksize = strlen(link) + 1;
-	lr_create_t *lr;
-	itx_t *itx;
-	char *payload;
-	uint64_t seq;
-
-	if (zilog == NULL)
-		return;
-
-	itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
-	lr = (lr_create_t *)&itx->itx_lr;
-
-	lr->lr_doid = dzp->z_id;
-	lr->lr_foid = zp->z_id;
-	lr->lr_mode = zp->z_phys->zp_mode;
-	lr->lr_uid = zp->z_phys->zp_uid;
-	lr->lr_gid = zp->z_phys->zp_gid;
-	lr->lr_gen = zp->z_phys->zp_gen;
-	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
-
-	/* name first, link target directly behind it */
-	payload = (char *)(lr + 1);
-	bcopy(name, payload, namesize);
-	bcopy(link, payload + namesize, linksize);
-
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = dzp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_rename() logs TX_RENAME transactions.
- *
- * The record is an lr_rename_t holding the source and target directory
- * objects, followed by the NUL-terminated source name and then the
- * NUL-terminated target name. Nothing is logged when zilog is NULL. The
- * assigned sequence number is stored in the z_last_itx of sdzp, tdzp and
- * szp.
- */
-void
-zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
-{
-	size_t snamesize = strlen(sname) + 1;
-	size_t dnamesize = strlen(dname) + 1;
-	lr_rename_t *lr;
-	itx_t *itx;
-	char *payload;
-	uint64_t seq;
-
-	if (zilog == NULL)
-		return;
-
-	itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
-	lr = (lr_rename_t *)&itx->itx_lr;
-	lr->lr_sdoid = sdzp->z_id;
-	lr->lr_tdoid = tdzp->z_id;
-
-	/* source name first, target name directly behind it */
-	payload = (char *)(lr + 1);
-	bcopy(sname, payload, snamesize);
-	bcopy(dname, payload + snamesize, dnamesize);
-
-	seq = zil_itx_assign(zilog, itx, tx);
-	sdzp->z_last_itx = seq;
-	tdzp->z_last_itx = seq;
-	szp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_write() handles TX_WRITE transactions.
- */
-/*
- * Size threshold used by zfs_log_write(): a write larger than this, in a
- * pool without separate log devices, is logged as WR_INDIRECT.
- */
-ssize_t zfs_immediate_write_sz = 32768;
-
-/*
- * SPA_MAXBLOCKSIZE less the sizes of a zil_trailer_t and an lr_write_t.
- * zfs_log_write() splits larger writes when the pool has separate logs.
- */
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
- sizeof (lr_write_t))
-
-/*
- * Log a write of resid bytes at offset off of zp as one or more TX_WRITE
- * itxs. Nothing is logged when zilog is NULL or the znode is unlinked.
- * ioflag is checked for FSYNC/FDSYNC to pick the write state and to mark
- * the itx synchronous. zp->z_last_itx is updated for every itx assigned.
- */
-void
-zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag)
-{
- itx_wr_state_t write_state;
- boolean_t slogging;
- uintptr_t fsync_cnt;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * If the write is greater than zfs_immediate_write_sz and there are
- * no separate logs in this pool then later *if* we need to log the
- * write then dmu_sync() is used to immediately write the block and
- * its block pointer is put in the log record.
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), then we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
- slogging = spa_has_slogs(zilog->zl_spa);
- if (resid > zfs_immediate_write_sz && !slogging)
- write_state = WR_INDIRECT;
- else if (ioflag & (FSYNC | FDSYNC))
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
-
- /*
- * Consume one unit of this thread's fsyncer count, if it has any.
- * The pre-decrement value is kept in fsync_cnt and used below to
- * decide whether the itx is synchronous.
- */
- if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
- (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
- }
-
- while (resid) {
- itx_t *itx;
- lr_write_t *lr;
- ssize_t len;
-
- /*
- * If there are slogs and the write would overflow the largest
- * block, then because we don't want to use the main pool
- * to dmu_sync, we have to split the write.
- */
- if (slogging && resid > ZIL_MAX_LOG_DATA)
- len = SPA_MAXBLOCKSIZE >> 1;
- else
- len = resid;
-
- /* Only WR_COPIED records carry the data inline after the lr. */
- itx = zil_itx_create(txtype, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
- lr = (lr_write_t *)&itx->itx_lr;
- /*
- * If the data cannot be read back for a WR_COPIED record, free
- * the oversized itx and fall back to WR_NEED_COPY. The changed
- * write_state also applies to any remaining pieces of the write.
- */
- if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
- zp->z_id, off, len, lr + 1) != 0) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
- }
-
- itx->itx_wr_state = write_state;
- if (write_state == WR_NEED_COPY)
- itx->itx_sod += len;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- itx->itx_private = zp->z_zfsvfs;
-
- if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
- (ioflag & (FSYNC | FDSYNC)))
- itx->itx_sync = B_TRUE;
- else
- itx->itx_sync = B_FALSE;
-
- zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
-
- /* advance to the next piece of the write */
- off += len;
- resid -= len;
- }
-}
-
-/*
- * zfs_log_truncate() logs TX_TRUNCATE transactions.
- *
- * The record is an lr_truncate_t holding the object, offset and length.
- * Nothing is logged when zilog is NULL or the znode is unlinked. The itx
- * is marked synchronous when zp->z_sync_cnt is non-zero, and the assigned
- * sequence number is stored in zp->z_last_itx.
- */
-void
-zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, uint64_t off, uint64_t len)
-{
-	lr_truncate_t *lr;
-	itx_t *itx;
-
-	if (zilog == NULL || zp->z_unlinked)
-		return;
-
-	itx = zil_itx_create(txtype, sizeof (*lr));
-	itx->itx_sync = (zp->z_sync_cnt != 0);
-
-	lr = (lr_truncate_t *)&itx->itx_lr;
-	lr->lr_foid = zp->z_id;
-	lr->lr_offset = off;
-	lr->lr_length = len;
-
-	zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * zfs_log_setattr() logs TX_SETATTR transactions.
- *
- * The record is an lr_setattr_t, optionally followed by packed xvattr
- * data (when AT_XVATTR is set in vap->va_mask) and then by any FUID
- * domain strings from fuidp. When AT_UID / AT_GID was applied and the
- * id is ephemeral, the log-centric FUID from fuidp is recorded instead of
- * the raw id. Nothing is logged when zilog is NULL or the znode is
- * unlinked.
- */
-void
-zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
-{
-	xvattr_t *xvap = (xvattr_t *)vap;
-	lr_setattr_t *lr;
-	itx_t *itx;
-	size_t recsize = sizeof (lr_setattr_t);
-	size_t xvatsize = 0;
-	void *tail;
-
-	if (zilog == NULL || zp->z_unlinked)
-		return;
-
-	/*
-	 * With optional attributes the record also needs room for the
-	 * lr_attr_t, the request bitmap, the create time and the packed
-	 * attribute values.
-	 */
-	if (vap->va_mask & AT_XVATTR)
-		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
-	recsize += xvatsize;
-
-	if (fuidp)
-		recsize += fuidp->z_domain_str_sz;
-
-	itx = zil_itx_create(txtype, recsize);
-	lr = (lr_setattr_t *)&itx->itx_lr;
-	lr->lr_foid = zp->z_id;
-	lr->lr_mask = (uint64_t)mask_applied;
-	lr->lr_mode = (uint64_t)vap->va_mode;
-
-	if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
-		lr->lr_uid = fuidp->z_fuid_owner;
-	else
-		lr->lr_uid = (uint64_t)vap->va_uid;
-
-	if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
-		lr->lr_gid = fuidp->z_fuid_group;
-	else
-		lr->lr_gid = (uint64_t)vap->va_gid;
-
-	lr->lr_size = (uint64_t)vap->va_size;
-	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
-	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
-
-	/* variable-length data begins right after the fixed record */
-	tail = lr + 1;
-	if (vap->va_mask & AT_XVATTR) {
-		zfs_log_xvattr((lr_attr_t *)tail, xvap);
-		tail = (caddr_t)tail + xvatsize;
-	}
-
-	/* domain strings, if any, go last */
-	if (fuidp)
-		(void) zfs_log_fuid_domains(fuidp, tail);
-
-	itx->itx_sync = (zp->z_sync_cnt != 0);
-	zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * zfs_log_acl() logs ACL changes.
- *
- * File systems at ZPL_VERSION_INITIAL log a TX_ACL_V0 record: an
- * lr_acl_v0_t followed by the raw ACEs. All others log a TX_ACL record:
- * an lr_acl_t followed by the ACEs (padded to ZIL_ACE_LENGTH), the log
- * FUIDs and the FUID domain strings. Nothing is logged when zilog is
- * NULL or the znode is unlinked. The itx is marked synchronous when
- * zp->z_sync_cnt is non-zero, and the assigned sequence number is stored
- * in zp->z_last_itx.
- */
-void
-zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
-    vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
-{
-	itx_t *itx;
-	uint64_t seq;
-	lr_acl_v0_t *lrv0;
-	lr_acl_t *lr;
-	int txtype;
-	int lrsize;
-	size_t txsize;
-	size_t aclbytes = vsecp->vsa_aclentsz;
-
-	txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ?
-	    TX_ACL_V0 : TX_ACL;
-
-	if (txtype == TX_ACL)
-		lrsize = sizeof (*lr);
-	else
-		lrsize = sizeof (*lrv0);
-
-	if (zilog == NULL || zp->z_unlinked)
-		return;
-
-	/*
-	 * Each logged FUID occupies one uint64_t, the same unit
-	 * zfs_log_fuid_ids() writes and zfs_log_create() sizes for.
-	 */
-	txsize = lrsize +
-	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
-	    (fuidp ? fuidp->z_domain_str_sz : 0) +
-	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
-
-	itx = zil_itx_create(txtype, txsize);
-
-	lr = (lr_acl_t *)&itx->itx_lr;
-	lr->lr_foid = zp->z_id;
-	if (txtype == TX_ACL) {
-		lr->lr_acl_bytes = aclbytes;
-		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
-		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
-		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
-			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
-		else
-			lr->lr_acl_flags = 0;
-	}
-	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
-
-	if (txtype == TX_ACL_V0) {
-		lrv0 = (lr_acl_v0_t *)lr;
-		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
-	} else {
-		void *start = (ace_t *)(lr + 1);
-
-		bcopy(vsecp->vsa_aclentp, start, aclbytes);
-
-		/* FUID data starts after the padded ACE area */
-		start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
-
-		if (fuidp) {
-			start = zfs_log_fuid_ids(fuidp, start);
-			(void) zfs_log_fuid_domains(fuidp, start);
-		}
-	}
-
-	itx->itx_sync = (zp->z_sync_cnt != 0);
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = seq;
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_replay.c b/zfs/lib/libdmu-ctl/zfs_replay.c
deleted file mode 100644
index ca9990d7c..000000000
--- a/zfs/lib/libdmu-ctl/zfs_replay.c
+++ /dev/null
@@ -1,876 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_replay.c 1.7 08/01/14 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/thread.h>
-#include <sys/file.h>
-#include <sys/fcntl.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_fuid.h>
-#include <sys/spa.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/stat.h>
-#include <sys/mode.h>
-#include <sys/acl.h>
-#include <sys/atomic.h>
-#include <sys/cred.h>
-
-/*
- * Functions to replay ZFS intent log (ZIL) records
- * The functions are called through a function vector (zfs_replay_vector)
- * which is indexed by the transaction type.
- */
-
-/*
- * Zero a vattr_t and fill it in from values taken out of a log record.
- * The vnode type and the permission bits are both derived from mode.
- * An ephemeral uid or gid is stored as -1; any other id is stored as
- * given.
- */
-static void
-zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
-    uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
-{
-	bzero(vap, sizeof (*vap));
-
-	vap->va_mask = (uint_t)mask;
-	vap->va_type = IFTOVT(mode);
-	vap->va_mode = mode & MODEMASK;
-
-	if (IS_EPHEMERAL(uid))
-		vap->va_uid = -1;
-	else
-		vap->va_uid = uid;
-
-	if (IS_EPHEMERAL(gid))
-		vap->va_gid = -1;
-	else
-		vap->va_gid = gid;
-
-	vap->va_rdev = zfs_cmpldev(rdev);
-	vap->va_nodeid = nodeid;
-}
-
-/*
- * Replay handler that ignores its arguments and always fails with
- * ENOTSUP.
- */
-/* ARGSUSED */
-static int
-zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-/*
- * Unpack the optional attributes stored in a log record into an xvattr_t.
- *
- * This is the inverse of zfs_log_xvattr(): the request bitmap is copied
- * back into xvap, and for each requested attribute the value is taken
- * from the packed XAT0_* word, the create time words, or the scanstamp
- * that follow the bitmap. AT_XVATTR is set in the vattr mask, unless
- * xvap has no optional attribute area, in which case it is cleared and
- * nothing else is done.
- */
-static void
-zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
-{
-	xoptattr_t *xoap = NULL;
-	uint64_t *attrs;
-	uint64_t *crtime;
-	uint32_t *bitmap;
-	void *scanstamp;
-	int i;
-
-	xvap->xva_vattr.va_mask |= AT_XVATTR;
-	if ((xoap = xva_getxoptattr(xvap)) == NULL) {
-		xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
-		return;
-	}
-
-	ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
-
-	bitmap = &lrattr->lr_attr_bitmap;
-	for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
-		xvap->xva_reqattrmap[i] = *bitmap;
-
-	/*
-	 * The attribute word starts right after the last bitmap word, which
-	 * is where zfs_log_xvattr() puts it. Computing it as
-	 * lrattr + masksize - 1 lands on the same address only for a
-	 * three-word bitmap.
-	 */
-	attrs = (uint64_t *)bitmap;
-	crtime = attrs + 1;
-	scanstamp = (caddr_t)(crtime + 2);
-
-	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
-		xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
-		xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
-		xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
-		xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
-		xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
-		xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
-		xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
-		xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
-		xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
-		xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
-		xoap->xoa_av_quarantined =
-		    ((*attrs & XAT0_AV_QUARANTINED) != 0);
-	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
-		ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
-	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
-		bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
-}
-
-/*
- * Count how many distinct, non-zero FUID domain indexes the given uid and
- * gid refer to: 0, 1 or 2. A gid whose index equals the uid's index is
- * not counted a second time.
- */
-static int
-zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
-{
-	uint64_t owner_idx = FUID_INDEX(uid);
-	uint64_t group_idx = FUID_INDEX(gid);
-	int count = (owner_idx != 0) ? 1 : 0;
-
-	if (group_idx > 0 && group_idx != owner_idx)
-		count++;
-
-	return (count);
-}
-
-/*
- * Point the first domcnt entries of fuid_infop->z_domain_table at the
- * consecutive NUL-terminated strings that begin at start. The strings
- * are referenced in place, not copied. Returns the address just past the
- * last string.
- */
-static void *
-zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
-    int domcnt)
-{
-	char *cursor = start;
-	int slot;
-
-	for (slot = 0; slot != domcnt; slot++) {
-		fuid_infop->z_domain_table[slot] = cursor;
-		cursor += strlen(cursor) + 1;
-	}
-
-	return (cursor);
-}
-
-/*
- * Record the owner and/or group FUID in the fuid_info structure.
- * Only ephemeral ids are recorded; for any other id the corresponding
- * field is left untouched.
- */
-static void
-zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
-{
-	if (IS_EPHEMERAL(uid)) {
-		fuid_infop->z_fuid_owner = uid;
-	}
-	if (IS_EPHEMERAL(gid)) {
-		fuid_infop->z_fuid_group = gid;
-	}
-}
-
-/*
- * Load fuid domains into a newly allocated zfs_fuid_info_t.
- *
- * buf points at the domain strings in the log record; uid and gid are
- * the logged ids, whose FUID indexes determine how many domain strings
- * are present. On return *end points just past the domain strings, or
- * equals buf when there are none. The returned structure is never NULL
- * and is owned by the caller.
- */
-static zfs_fuid_info_t *
-zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
-{
-	int domcnt;
-
-	zfs_fuid_info_t *fuid_infop;
-
-	fuid_infop = zfs_fuid_info_alloc();
-
-	domcnt = zfs_replay_domain_cnt(uid, gid);
-
-	if (domcnt == 0) {
-		/* No domain strings to skip: always define the out-param. */
-		*end = buf;
-		return (fuid_infop);
-	}
-
-	/* one string pointer per domain */
-	fuid_infop->z_domain_table =
-	    kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
-
-	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
-
-	fuid_infop->z_domain_cnt = domcnt;
-	*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
-	return (fuid_infop);
-}
-
-/*
- * Load logged FUIDs and FUID domains into a newly allocated
- * zfs_fuid_info_t.
- *
- * start points at idcnt 64-bit log FUIDs followed by domcnt
- * NUL-terminated domain strings. Each log FUID becomes a zfs_fuid_t
- * appended to z_fuids with z_id set to -1 and z_domidx set to 0. On
- * return *end points just past the domain strings.
- */
-static zfs_fuid_info_t *
-zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
-    uint64_t gid)
-{
-	zfs_fuid_info_t *info = zfs_fuid_info_alloc();
-	uint64_t *next_id = start;
-	int n;
-
-	info->z_domain_cnt = domcnt;
-	info->z_domain_table =
-	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
-
-	for (n = 0; n != idcnt; n++, next_id++) {
-		zfs_fuid_t *entry = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
-
-		entry->z_logfuid = *next_id;
-		entry->z_id = -1;
-		entry->z_domidx = 0;
-		list_insert_tail(&info->z_fuids, entry);
-	}
-
-	zfs_replay_fuid_ugid(info, uid, gid);
-
-	/* the domain strings follow the last log FUID */
-	*end = zfs_replay_fuid_domain_common(info, next_id, domcnt);
-	return (info);
-}
-
-/*
- * Byteswap the xvattr portion of a log record in place: the lr_attr_t
- * itself, the remaining 32-bit bitmap words that follow it, and then
- * three 64-bit words (the attribute word plus the two create time words).
- */
-static void
-zfs_replay_swap_attrs(lr_attr_t *lrattr)
-{
-	caddr_t tail = (caddr_t)(lrattr + 1);
-	size_t extra_bitmap;
-
-	/* swap the lr_attr structure */
-	byteswap_uint32_array(lrattr, sizeof (*lrattr));
-
-	/* lr_attr_masksize is read only after the swap above */
-	extra_bitmap = (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t);
-
-	/* swap the rest of the bitmap */
-	byteswap_uint32_array(tail, extra_bitmap);
-
-	/* swap the attributes, create time + 64 bit word for attributes */
-	byteswap_uint64_array(tail + extra_bitmap, 3 * sizeof (uint64_t));
-}
-
-/*
- * Replay file create with optional ACL, xvattr information as well
- * as optional FUID information.
- *
- * Handles TX_CREATE_ACL, TX_CREATE_ACL_ATTR, TX_MKDIR_ACL and
- * TX_MKDIR_ACL_ATTR; any other type fails with ENOTSUP. The record is an
- * lr_acl_create_t followed by [xvattr data], the ACEs, the log FUIDs, the
- * FUID domain strings and finally the entry name.
- *
- * Returns 0 without doing anything if the object already exists, the
- * dmu_object_info() error if the lookup fails for a reason other than
- * ENOENT, and otherwise the result of the create operation.
- */
-static int
-zfs_replay_create_acl(zfsvfs_t *zfsvfs,
-    lr_acl_create_t *lracl, boolean_t byteswap)
-{
-	char *name = NULL;		/* location determined later */
-	lr_create_t *lr = (lr_create_t *)lracl;
-	znode_t *dzp;
-	vnode_t *vp = NULL;
-	xvattr_t xva;
-	int vflg = 0;
-	vsecattr_t vsec = { 0 };
-	lr_attr_t *lrattr;
-	void *aclstart;
-	void *fuidstart;
-	size_t xvatlen = 0;
-	uint64_t txtype;
-	int error;
-
-	if (byteswap) {
-		byteswap_uint64_array(lracl, sizeof (*lracl));
-		txtype = (int)lr->lr_common.lrc_txtype;
-		if (txtype == TX_CREATE_ACL_ATTR ||
-		    txtype == TX_MKDIR_ACL_ATTR) {
-			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
-			zfs_replay_swap_attrs(lrattr);
-			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
-		}
-
-		aclstart = (caddr_t)(lracl + 1) + xvatlen;
-		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
-		/* swap fuids */
-		if (lracl->lr_fuidcnt) {
-			byteswap_uint64_array((caddr_t)aclstart +
-			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
-			    lracl->lr_fuidcnt * sizeof (uint64_t));
-		}
-	}
-
-	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
-		return (error);
-
-	xva_init(&xva);
-	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
-	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
-
-	/*
-	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
-	 * eventually end up in zfs_mknode(), which assigns the object's
-	 * creation time and generation number.  The generic VOP_CREATE()
-	 * doesn't have either concept, so we smuggle the values inside
-	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
-	 */
-	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
-	xva.xva_vattr.va_nblocks = lr->lr_gen;
-
-	/* Only proceed if the object does not exist yet (ENOENT). */
-	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
-	if (error != ENOENT)
-		goto bail;
-
-	if (lr->lr_common.lrc_txtype & TX_CI)
-		vflg |= FIGNORECASE;
-	switch ((int)lr->lr_common.lrc_txtype) {
-	case TX_CREATE_ACL:
-		aclstart = (caddr_t)(lracl + 1);
-		fuidstart = (caddr_t)aclstart +
-		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
-		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
-		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
-		    lr->lr_uid, lr->lr_gid);
-		/*FALLTHROUGH*/
-	case TX_CREATE_ACL_ATTR:
-		/* name is still NULL only when entered as TX_CREATE_ACL_ATTR */
-		if (name == NULL) {
-			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
-			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
-			xva.xva_vattr.va_mask |= AT_XVATTR;
-			zfs_replay_xvattr(lrattr, &xva);
-		}
-		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
-		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
-		vsec.vsa_aclcnt = lracl->lr_aclcnt;
-		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
-		vsec.vsa_aclflags = lracl->lr_acl_flags;
-		if (zfsvfs->z_fuid_replay == NULL) {
-			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
-			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
-			zfsvfs->z_fuid_replay =
-			    zfs_replay_fuids(fuidstart,
-			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
-			    lr->lr_uid, lr->lr_gid);
-		}
-
-		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
-		    0, 0, &vp, kcred, vflg, NULL, &vsec);
-		break;
-	case TX_MKDIR_ACL:
-		aclstart = (caddr_t)(lracl + 1);
-		fuidstart = (caddr_t)aclstart +
-		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
-		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
-		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
-		    lr->lr_uid, lr->lr_gid);
-		/*FALLTHROUGH*/
-	case TX_MKDIR_ACL_ATTR:
-		/* name is still NULL only when entered as TX_MKDIR_ACL_ATTR */
-		if (name == NULL) {
-			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
-			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
-			zfs_replay_xvattr(lrattr, &xva);
-		}
-		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
-		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
-		vsec.vsa_aclcnt = lracl->lr_aclcnt;
-		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
-		vsec.vsa_aclflags = lracl->lr_acl_flags;
-		if (zfsvfs->z_fuid_replay == NULL) {
-			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
-			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
-			zfsvfs->z_fuid_replay =
-			    zfs_replay_fuids(fuidstart,
-			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
-			    lr->lr_uid, lr->lr_gid);
-		}
-		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
-		    &vp, kcred, NULL, vflg, &vsec);
-		break;
-	default:
-		error = ENOTSUP;
-	}
-
-bail:
-	if (error == 0 && vp != NULL)
-		VN_RELE(vp);
-
-	VN_RELE(ZTOV(dzp));
-
-	/*
-	 * z_fuid_replay is only set inside the switch above, so it can
-	 * still be NULL here (object already existed, or unsupported
-	 * type).  Guard the free the same way zfs_replay_create() does.
-	 */
-	if (zfsvfs->z_fuid_replay)
-		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-	zfsvfs->z_fuid_replay = NULL;
-
-	return (error);
-}
-
-/*
- * Replay a create-style record that carries an lr_create_t header:
- * TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR, TX_MKXATTR and
- * TX_SYMLINK. Any other type fails with ENOTSUP.
- *
- * Returns 0 without doing anything if the object already exists, the
- * dmu_object_info() error if the lookup fails for a reason other than
- * ENOENT, and otherwise the result of the create operation.
- */
-static int
-zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
-{
- char *name = NULL; /* location determined later */
- char *link; /* symlink content follows name */
- znode_t *dzp;
- vnode_t *vp = NULL;
- xvattr_t xva;
- int vflg = 0;
- size_t lrsize = sizeof (lr_create_t);
- lr_attr_t *lrattr;
- void *start;
- size_t xvatlen;
- uint64_t txtype;
- int error;
-
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- txtype = (int)lr->lr_common.lrc_txtype;
- if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
- zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
- }
-
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- xva_init(&xva);
- zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
-
- /*
- * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
- * eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic VOP_CREATE()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
- */
- ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
- xva.xva_vattr.va_nblocks = lr->lr_gen;
-
- /* Only proceed if the object does not exist yet (ENOENT). */
- error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
- if (error != ENOENT)
- goto out;
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
-
- /*
- * Symlinks don't have fuid info, and CIFS never creates
- * symlinks.
- *
- * The _ATTR versions will grab the fuid info in their subcases.
- */
- if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
- (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
- (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
- /*
- * start is pre-set to lr + 1 because zfs_replay_fuid_domain()
- * does not store through its end argument when there are no
- * domain strings.
- */
- start = (lr + 1);
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- }
-
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_CREATE_ATTR:
- /* xvattr data comes first, then domain strings, then the name */
- lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
- start = (caddr_t)(lr + 1) + xvatlen;
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- name = (char *)start;
-
- /*FALLTHROUGH*/
- case TX_CREATE:
- /* for plain TX_CREATE the name follows any domain strings */
- if (name == NULL)
- name = (char *)start;
-
- error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
- 0, 0, &vp, kcred, vflg, NULL, NULL);
- break;
- case TX_MKDIR_ATTR:
- /* xvattr data comes first, then domain strings, then the name */
- lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
- start = (caddr_t)(lr + 1) + xvatlen;
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- name = (char *)start;
-
- /*FALLTHROUGH*/
- case TX_MKDIR:
- /*
- * NOTE(review): plain TX_CREATE takes the name from start (past
- * any FUID domain strings), but this case uses lr + 1. Confirm
- * that a TX_MKDIR record with domain strings is replayed with
- * the right name.
- */
- if (name == NULL)
- name = (char *)(lr + 1);
-
- error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
- &vp, kcred, NULL, vflg, NULL);
- break;
- case TX_MKXATTR:
- name = (char *)(lr + 1);
- error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
- break;
- case TX_SYMLINK:
- /* the link target is stored right after the NUL-terminated name */
- name = (char *)(lr + 1);
- link = name + strlen(name) + 1;
- error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr,
- link, kcred, NULL, vflg);
- break;
- default:
- error = ENOTSUP;
- }
-
-out:
- if (error == 0 && vp != NULL)
- VN_RELE(vp);
-
- VN_RELE(ZTOV(dzp));
-
- /* z_fuid_replay is still NULL on the paths that never loaded it */
- if (zfsvfs->z_fuid_replay)
- zfs_fuid_info_free(zfsvfs->z_fuid_replay);
- zfsvfs->z_fuid_replay = NULL;
- return (error);
-}
-
-/*
- * Replay a TX_REMOVE or TX_RMDIR record.
- *
- * The entry name is stored directly behind the lr_remove_t and is
- * removed from the directory identified by lr->lr_doid. Any other
- * transaction type yields ENOTSUP. Returns 0 or an errno value.
- */
-static int
-zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
-{
-	znode_t *parent;
-	char *entry;
-	int flags;
-	int rc;
-
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	rc = zfs_zget(zfsvfs, lr->lr_doid, &parent);
-	if (rc != 0)
-		return (rc);
-
-	entry = (char *)(lr + 1);
-	flags = (lr->lr_common.lrc_txtype & TX_CI) ? FIGNORECASE : 0;
-
-	switch ((int)lr->lr_common.lrc_txtype) {
-	case TX_RMDIR:
-		rc = VOP_RMDIR(ZTOV(parent), entry, NULL, kcred, NULL, flags);
-		break;
-	case TX_REMOVE:
-		rc = VOP_REMOVE(ZTOV(parent), entry, kcred, NULL, flags);
-		break;
-	default:
-		rc = ENOTSUP;
-		break;
-	}
-
-	VN_RELE(ZTOV(parent));
-	return (rc);
-}
-
-/*
- * Replay a link record: create an entry, whose name is stored behind
- * the lr_link_t, in directory lr->lr_doid referring to object
- * lr->lr_link_obj. Returns 0 or an errno value.
- */
-static int
-zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
-{
-	znode_t *dir_zp, *target_zp;
-	char *linkname;
-	int flags = 0;
-	int rc;
-
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir_zp);
-	if (rc != 0)
-		return (rc);
-
-	rc = zfs_zget(zfsvfs, lr->lr_link_obj, &target_zp);
-	if (rc != 0)
-		goto rele_dir;
-
-	if (lr->lr_common.lrc_txtype & TX_CI)
-		flags = FIGNORECASE;
-
-	linkname = (char *)(lr + 1);
-	rc = VOP_LINK(ZTOV(dir_zp), ZTOV(target_zp), linkname, kcred,
-	    NULL, flags);
-
-	VN_RELE(ZTOV(target_zp));
-rele_dir:
-	VN_RELE(ZTOV(dir_zp));
-	return (rc);
-}
-
-/*
- * Replay a rename record. Two NUL-terminated names are stored behind
- * the lr_rename_t, source name first and target name second; they are
- * looked up relative to directories lr->lr_sdoid and lr->lr_tdoid.
- * Returns 0 or an errno value.
- */
-static int
-zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
-{
-	znode_t *src_dir, *dst_dir;
-	char *from_name, *to_name;
-	int flags = 0;
-	int rc;
-
-	from_name = (char *)(lr + 1);
-	to_name = from_name + strlen(from_name) + 1;
-
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	rc = zfs_zget(zfsvfs, lr->lr_sdoid, &src_dir);
-	if (rc != 0)
-		return (rc);
-
-	rc = zfs_zget(zfsvfs, lr->lr_tdoid, &dst_dir);
-	if (rc != 0) {
-		VN_RELE(ZTOV(src_dir));
-		return (rc);
-	}
-
-	if (lr->lr_common.lrc_txtype & TX_CI)
-		flags = FIGNORECASE;
-
-	rc = VOP_RENAME(ZTOV(src_dir), from_name, ZTOV(dst_dir), to_name,
-	    kcred, NULL, flags);
-
-	VN_RELE(ZTOV(dst_dir));
-	VN_RELE(ZTOV(src_dir));
-	return (rc);
-}
-
-/*
- * Replay a write record: write the lr->lr_length bytes stored behind
- * the lr_write_t to object lr->lr_foid at offset lr->lr_offset.
- * A missing object (ENOENT from zfs_zget()) is not an error.
- */
-static int
-zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
-{
-	znode_t *zp;
-	ssize_t leftover;
-	char *payload;
-	int rc;
-
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
-	/*
-	 * Writes can be logged out of order, so the file may already
-	 * have been removed; such a write is silently dropped.
-	 */
-	if (rc == ENOENT)
-		return (0);
-	if (rc != 0)
-		return (rc);
-
-	payload = (char *)(lr + 1);
-	rc = vn_rdwr(UIO_WRITE, ZTOV(zp), payload, lr->lr_length,
-	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
-	    &leftover);
-
-	VN_RELE(ZTOV(zp));
-	return (rc);
-}
-
-/*
- * Replay a truncate record by issuing VOP_SPACE(F_FREESP) on object
- * lr->lr_foid for the range starting at lr->lr_offset with length
- * lr->lr_length. A missing object (ENOENT) is not an error.
- */
-static int
-zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
-{
-	znode_t *zp;
-	flock64_t span;
-	int rc;
-
-	if (byteswap)
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
-	/*
-	 * Truncates can be logged out of order, so the file may already
-	 * have been removed; such a truncate is silently dropped.
-	 */
-	if (rc == ENOENT)
-		return (0);
-	if (rc != 0)
-		return (rc);
-
-	bzero(&span, sizeof (span));
-	span.l_start = lr->lr_offset;
-	span.l_len = lr->lr_length;
-	span.l_whence = 0;
-	span.l_type = F_WRLCK;
-
-	rc = VOP_SPACE(ZTOV(zp), F_FREESP, &span, FWRITE | FOFFMAX,
-	    lr->lr_offset, kcred, NULL);
-
-	VN_RELE(ZTOV(zp));
-	return (rc);
-}
-
-/*
- * Replay a setattr record on object lr->lr_foid.
- *
- * The record may be followed by an optional xvattr block (present when
- * AT_XVATTR is set in lr->lr_mask) and then by FUID domain data, which
- * is handed to zfs_replay_fuid_domain(). zfsvfs->z_fuid_replay is used
- * as scratch state for the duration of the VOP_SETATTR() call and is
- * NULL again on return.
- *
- * A missing object (ENOENT from zfs_zget()) is not an error.
- * Returns 0 or an errno value.
- */
-static int
-zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
-{
-	znode_t *zp;
-	xvattr_t xva;
-	vattr_t *vap = &xva.xva_vattr;
-	int error;
-	void *start;
-
-	xva_init(&xva);
-	if (byteswap) {
-		byteswap_uint64_array(lr, sizeof (*lr));
-
-		if ((lr->lr_mask & AT_XVATTR) &&
-		    zfsvfs->z_version >= ZPL_VERSION_INITIAL)
-			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
-	}
-
-	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-		/*
-		 * As we can log setattrs out of order, it's possible the
-		 * file has been removed. In this case just drop the setattr
-		 * and return success.
-		 */
-		if (error == ENOENT)
-			error = 0;
-		return (error);
-	}
-
-	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
-	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
-
-	vap->va_size = lr->lr_size;
-	ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
-	ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
-
-	/*
-	 * Fill in xvattr_t portions if necessary, and step 'start' past
-	 * them so that it points at the FUID data.
-	 */
-	start = lr + 1;
-	if (vap->va_mask & AT_XVATTR) {
-		zfs_replay_xvattr((lr_attr_t *)start, &xva);
-		start = (caddr_t)start +
-		    ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
-	} else
-		xva.xva_vattr.va_mask &= ~AT_XVATTR;
-
-	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
-	    lr->lr_uid, lr->lr_gid);
-
-	error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);
-
-	/*
-	 * Guard the free the same way zfs_replay_create() and
-	 * zfs_replay_acl() do, so a NULL z_fuid_replay is never passed on.
-	 */
-	if (zfsvfs->z_fuid_replay)
-		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-	zfsvfs->z_fuid_replay = NULL;
-	VN_RELE(ZTOV(zp));
-
-	return (error);
-}
-
-/*
- * Replay an old-format (v0) ACL record: the lr->lr_aclcnt entries of
- * the ace_t array stored behind the lr_acl_v0_t are applied to object
- * lr->lr_foid with VOP_SETSECATTR(). A missing object (ENOENT) is not
- * an error.
- */
-static int
-zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
-{
-	ace_t *entries = (ace_t *)(lr + 1);
-	vsecattr_t secattr;
-	znode_t *zp;
-	int rc;
-
-	if (byteswap) {
-		/* Swap the header first; the ACE count is read from it. */
-		byteswap_uint64_array(lr, sizeof (*lr));
-		zfs_oldace_byteswap(entries, lr->lr_aclcnt);
-	}
-
-	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
-	/*
-	 * ACLs can be logged out of order, so the file may already have
-	 * been removed; such an ACL is silently dropped.
-	 */
-	if (rc == ENOENT)
-		return (0);
-	if (rc != 0)
-		return (rc);
-
-	bzero(&secattr, sizeof (secattr));
-	secattr.vsa_aclentp = entries;
-	secattr.vsa_aclcnt = lr->lr_aclcnt;
-	secattr.vsa_mask = VSA_ACE | VSA_ACECNT;
-
-	rc = VOP_SETSECATTR(ZTOV(zp), &secattr, 0, kcred, NULL);
-
-	VN_RELE(ZTOV(zp));
-	return (rc);
-}
-
-/*
- * Replaying ACLs is complicated by FUID support.
- *
- * The log record may carry optional data used for replaying FUIDs:
- * the actual FUIDs that were created initially. The FUID table index
- * may no longer be valid, and during zfs_create() a new index may be
- * assigned. Because of this the log contains the original domain+rid
- * so that a new FUID can be created.
- *
- * The individual ACEs may contain an ephemeral uid/gid which is no
- * longer valid and will need to be replaced with an actual FUID.
- *
- * Record layout: lr_acl_t, the ACEs (lr->lr_acl_bytes, padded per
- * ZIL_ACE_LENGTH()), then the optional FUID data.
- */
-static int
-zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
-{
-	ace_t *entries = (ace_t *)(lr + 1);
-	vsecattr_t secattr;
-	znode_t *zp;
-	int rc;
-
-	if (byteswap) {
-		byteswap_uint64_array(lr, sizeof (*lr));
-		zfs_ace_byteswap(entries, lr->lr_acl_bytes, B_FALSE);
-		if (lr->lr_fuidcnt) {
-			byteswap_uint64_array((caddr_t)entries +
-			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
-			    lr->lr_fuidcnt * sizeof (uint64_t));
-		}
-	}
-
-	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
-	/*
-	 * ACLs can be logged out of order, so the file may already have
-	 * been removed; such an ACL is silently dropped.
-	 */
-	if (rc == ENOENT)
-		return (0);
-	if (rc != 0)
-		return (rc);
-
-	bzero(&secattr, sizeof (secattr));
-	secattr.vsa_aclflags = lr->lr_acl_flags;
-	secattr.vsa_aclentsz = lr->lr_acl_bytes;
-	secattr.vsa_aclentp = entries;
-	secattr.vsa_aclcnt = lr->lr_aclcnt;
-	secattr.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
-
-	if (lr->lr_fuidcnt) {
-		void *fuid_data = (caddr_t)entries +
-		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);
-
-		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuid_data,
-		    &fuid_data, lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
-	}
-
-	rc = VOP_SETSECATTR(ZTOV(zp), &secattr, 0, kcred, NULL);
-
-	/* z_fuid_replay is only set above when the record has FUIDs. */
-	if (zfsvfs->z_fuid_replay)
-		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-	zfsvfs->z_fuid_replay = NULL;
-
-	VN_RELE(ZTOV(zp));
-	return (rc);
-}
-
-/*
- * Callback vectors for replaying records.
- *
- * The table has TX_MAX_TYPE slots, one handler per slot, and the
- * trailing comment on each entry names the transaction type that slot
- * is meant for. The initializers are positional, so their order must
- * stay in step with the numeric TX_* values.
- * NOTE(review): the TX_* values are defined elsewhere - confirm they
- * match this order.
- * Slot 0 is not a valid transaction type and maps to zfs_replay_error.
- */
-zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
- zfs_replay_error, /* 0 no such transaction type */
- zfs_replay_create, /* TX_CREATE */
- zfs_replay_create, /* TX_MKDIR */
- zfs_replay_create, /* TX_MKXATTR */
- zfs_replay_create, /* TX_SYMLINK */
- zfs_replay_remove, /* TX_REMOVE */
- zfs_replay_remove, /* TX_RMDIR */
- zfs_replay_link, /* TX_LINK */
- zfs_replay_rename, /* TX_RENAME */
- zfs_replay_write, /* TX_WRITE */
- zfs_replay_truncate, /* TX_TRUNCATE */
- zfs_replay_setattr, /* TX_SETATTR */
- zfs_replay_acl_v0, /* TX_ACL_V0 */
- zfs_replay_acl, /* TX_ACL */
- zfs_replay_create_acl, /* TX_CREATE_ACL */
- zfs_replay_create, /* TX_CREATE_ATTR */
- zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
- zfs_replay_create_acl, /* TX_MKDIR_ACL */
- zfs_replay_create, /* TX_MKDIR_ATTR */
- zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
-};
diff --git a/zfs/lib/libdmu-ctl/zfs_rlock.c b/zfs/lib/libdmu-ctl/zfs_rlock.c
deleted file mode 100644
index 44ec73b5d..000000000
--- a/zfs/lib/libdmu-ctl/zfs_rlock.c
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_rlock.c 1.4 07/08/08 SMI"
-
-/*
- * This file contains the code to implement file range locking in
- * ZFS, although there isn't much specific to ZFS (all that comes to mind
- * is support for growing the blocksize).
- *
- * Interface
- * ---------
- * Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
- *
- * AVL tree
- * --------
- * An AVL tree is used to maintain the state of the existing ranges
- * that are locked for exclusive (writer) or shared (reader) use.
- * The starting range offset is used for searching and sorting the tree.
- *
- * Common case
- * -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_range_lock() an rl_t is allocated, the tree
- * is searched and no overlap is found, and *this* rl_t is placed in the tree.
- *
- * Overlaps/Reference counting/Proxy locks
- * ---------------------------------------
- * The avl code only allows one node at a particular offset. Also it's very
- * inefficient to search through all previous entries looking for overlaps
- * (because the very 1st in the ordered list might be at offset 0 but
- * cover the whole file).
- * So this implementation uses reference counts and proxy range locks.
- * Firstly, only reader locks use reference counts and proxy locks,
- * because writer locks are exclusive.
- * When a reader lock overlaps with another then a proxy lock is created
- * for that range and replaces the original lock. If the overlap
- * is exact then the reference count of the proxy is simply incremented.
- * Otherwise, the proxy lock is split into smaller lock ranges and
- * new proxy locks created for non overlapping ranges.
- * The reference counts are adjusted accordingly.
- * Meanwhile, the original lock is kept around (this is the caller's handle)
- * and its offset and length are used when releasing the lock.
- *
- * Thread coordination
- * -------------------
- * In order to make wakeups efficient and to ensure multiple continuous
- * readers on a range don't starve a writer for the same range lock,
- * two condition variables are allocated in each rl_t.
- * If a writer (or reader) can't get a range it initialises the writer
- * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
- * and waits on that cv. When a thread unlocks that range it wakes up all
- * writers then all readers before destroying the lock.
- *
- * Append mode writes
- * ------------------
- * Append mode writes need to lock a range at the end of a file.
- * The offset of the end of the file is determined under the
- * range locking mutex, and the lock type converted from RL_APPEND to
- * RL_WRITER and the range locked.
- *
- * Grow block handling
- * -------------------
- * ZFS supports multiple block sizes, currently up to 128K. The smallest
- * block size is used for the file which is grown as needed. During this
- * growth all other writers and readers must be excluded.
- * So if the block size needs to be grown then the whole file is
- * exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_range_reduce().
- */
-
-#include <sys/zfs_rlock.h>
-
-/*
- * Check if a write lock can be grabbed, or wait and recheck until available.
- *
- * zp - znode whose range tree (z_range_avl) is being locked
- * new - lock to insert; r_type is RL_WRITER or RL_APPEND on entry and
- * RL_WRITER on return, and r_off/r_len may have been rewritten
- * (append offset, or whole file when the block size must grow)
- *
- * Waits with cv_wait() on zp->z_range_lock, so that mutex must be held
- * by the caller (zfs_range_lock() takes it before calling in here).
- * Returns only once 'new' has been inserted into the tree.
- */
-static void
-zfs_range_lock_writer(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl;
- avl_index_t where;
- uint64_t end_size;
- /* Remember the requested range; it is restored after every wait. */
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- for (;;) {
- /*
- * Range locking is also used by zvol and uses a
- * dummied up znode. However, for zvol, we don't need to
- * append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
- *
- * Yes, this is ugly, and would be solved by not handling
- * grow or append in range lock code. If that was done then
- * we could make the range locking code generically available
- * to other non-zfs consumers.
- */
- if (zp->z_vnode) { /* caller is ZPL */
- /*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
- */
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
-
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
- }
- }
-
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(tree) == 0) {
- new->r_type = RL_WRITER; /* convert to writer */
- avl_add(tree, new);
- return;
- }
-
- /*
- * Look for any locks in the range.
- */
- rl = avl_find(tree, new, &where);
- if (rl)
- goto wait; /* already locked at same offset */
-
- /* A lock starting after us conflicts if it starts before our end. */
- rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- if (rl && (rl->r_off < new->r_off + new->r_len))
- goto wait;
-
- /* A lock starting before us conflicts if it reaches past our start. */
- rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (rl && rl->r_off + rl->r_len > new->r_off)
- goto wait;
-
- new->r_type = RL_WRITER; /* convert possible RL_APPEND */
- avl_insert(tree, new, where);
- return;
-wait:
- /*
- * Sleep on the conflicting lock's writer cv. The first waiter
- * initialises the cv and sets the flag that tells the lock's
- * owner to broadcast on it when unlocking.
- */
- if (!rl->r_write_wanted) {
- cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
- rl->r_write_wanted = B_TRUE;
- }
- cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
-
- /* reset to original */
- new->r_off = off;
- new->r_len = len;
- }
-}
-
-/*
- * Make sure the tree entry for a reader range is a proxy.
- *
- * A lock that already is a proxy is returned unchanged. An original
- * (caller-owned) lock is taken out of the tree, its reference count is
- * zeroed, and a newly allocated proxy covering the same range with a
- * reference count of one is added to the tree and returned.
- */
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
-{
-	rl_t *stand_in;
-
-	if (rl->r_proxy) {
-		/* nothing to convert */
-		return (rl);
-	}
-
-	ASSERT3U(rl->r_cnt, ==, 1);
-	ASSERT(rl->r_write_wanted == B_FALSE);
-	ASSERT(rl->r_read_wanted == B_FALSE);
-
-	/* the original leaves the tree; the proxy carries the count */
-	avl_remove(tree, rl);
-	rl->r_cnt = 0;
-
-	stand_in = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	stand_in->r_type = RL_READER;
-	stand_in->r_proxy = B_TRUE;
-	stand_in->r_cnt = 1;
-	stand_in->r_off = rl->r_off;
-	stand_in->r_len = rl->r_len;
-	stand_in->r_read_wanted = B_FALSE;
-	stand_in->r_write_wanted = B_FALSE;
-	avl_add(tree, stand_in);
-
-	return (stand_in);
-}
-
-/*
- * Split a reader range lock in two at offset 'off', which must lie
- * strictly inside the range. The part before 'off' becomes the front
- * proxy, the part from 'off' onwards a new rear proxy with the same
- * reference count. The *front* proxy is returned.
- */
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
-{
-	rl_t *head, *tail;
-
-	ASSERT3U(rl->r_len, >, 1);
-	ASSERT3U(off, >, rl->r_off);
-	ASSERT3U(off, <, rl->r_off + rl->r_len);
-	ASSERT(rl->r_write_wanted == B_FALSE);
-	ASSERT(rl->r_read_wanted == B_FALSE);
-
-	/*
-	 * Build the rear half first: it copies r_cnt and r_len from rl,
-	 * and zfs_range_proxify() below may reset rl's count.
-	 */
-	tail = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	tail->r_type = RL_READER;
-	tail->r_proxy = B_TRUE;
-	tail->r_cnt = rl->r_cnt;
-	tail->r_off = off;
-	tail->r_len = rl->r_off + rl->r_len - off;
-	tail->r_read_wanted = B_FALSE;
-	tail->r_write_wanted = B_FALSE;
-
-	/* the front half keeps the start offset and is cut short */
-	head = zfs_range_proxify(tree, rl);
-	head->r_len = off - rl->r_off;
-
-	avl_insert_here(tree, tail, head, AVL_AFTER);
-	return (head);
-}
-
-/*
- * Allocate a reader proxy lock for [off, off + len) with a reference
- * count of one and add it to the tree. 'len' must be non-zero.
- */
-static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
-{
-	rl_t *proxy;
-
-	ASSERT(len);
-
-	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	proxy->r_type = RL_READER;
-	proxy->r_proxy = B_TRUE;
-	proxy->r_cnt = 1;
-	proxy->r_off = off;
-	proxy->r_len = len;
-	proxy->r_read_wanted = B_FALSE;
-	proxy->r_write_wanted = B_FALSE;
-
-	avl_add(tree, proxy);
-}
-
-/*
- * Add reader lock 'new' to the tree, sharing any overlapping ranges with
- * the reader locks already there.
- *
- * tree - range lock tree
- * new - the caller's lock. If nothing overlaps it is inserted as is;
- * otherwise its r_cnt is set to 0 and the range is represented
- * by proxies in the tree instead
- * prev - entry at the same offset, or nearest entry starting before
- * 'new', or NULL (see the comment in the body)
- * where - insertion point for 'new' from the caller's avl_find()
- */
-static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
-{
- rl_t *next;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * prev arrives either:
- * - pointing to an entry at the same offset
- * - pointing to the entry with the closest previous offset whose
- * range may overlap with the new range
- * - null, if there were no ranges starting before the new one
- */
- if (prev) {
- if (prev->r_off + prev->r_len <= off) {
- /* prev ends before the new range starts: no overlap */
- prev = NULL;
- } else if (prev->r_off != off) {
- /*
- * convert to proxy if needed then
- * split this entry and bump ref count
- */
- prev = zfs_range_split(tree, prev, off);
- prev = AVL_NEXT(tree, prev); /* move to rear range */
- }
- }
- ASSERT((prev == NULL) || (prev->r_off == off));
-
- /* 'next' is the first existing entry that could overlap the new range. */
- if (prev)
- next = prev;
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-
- if (next == NULL || off + len <= next->r_off) {
- /* no overlaps, use the original new rl_t in the tree */
- avl_insert(tree, new, where);
- return;
- }
-
- if (off < next->r_off) {
- /* Add a proxy for initial range before the overlap */
- zfs_range_new_proxy(tree, off, next->r_off - off);
- }
-
- new->r_cnt = 0; /* will use proxies in tree */
- /*
- * We now search forward through the ranges, until we go past the end
- * of the new range. For each entry we make it a proxy if it
- * isn't already, then bump its reference count. If there's any
- * gaps between the ranges then we create a new proxy range.
- */
- for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
- break;
- if (prev && prev->r_off + prev->r_len < next->r_off) {
- /* there's a gap */
- ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- next->r_off - (prev->r_off + prev->r_len));
- }
- if (off + len == next->r_off + next->r_len) {
- /* exact overlap with end */
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- return;
- }
- if (off + len < next->r_off + next->r_len) {
- /* new range ends in the middle of this block */
- next = zfs_range_split(tree, next, off + len);
- next->r_cnt++;
- return;
- }
- /* the new range extends beyond this entry: share it and go on */
- ASSERT3U(off + len, >, next->r_off + next->r_len);
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- }
-
- /*
- * Add the remaining end range. The loop did not return, so the new
- * range reaches past the last entry examined (prev).
- */
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- (off + len) - (prev->r_off + prev->r_len));
-}
-
-/*
- * Check if a reader lock can be grabbed, or wait and recheck until available.
- *
- * zp - znode whose range tree (z_range_avl) is being locked
- * new - reader lock to add
- *
- * A reader has to wait for any overlapping entry that is a writer or
- * that has a writer waiting on it (r_write_wanted). Waits with cv_wait()
- * on zp->z_range_lock, so that mutex must be held by the caller
- * (zfs_range_lock() takes it before calling in here).
- */
-static void
-zfs_range_lock_reader(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *prev, *next;
- avl_index_t where;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * Look for any writer locks in the range.
- */
-retry:
- /* prev: entry at the same offset, else the nearest one starting before. */
- prev = avl_find(tree, new, &where);
- if (prev == NULL)
- prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-
- /*
- * Check the previous range for a writer lock overlap.
- */
- if (prev && (off < prev->r_off + prev->r_len)) {
- if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
- /* first waiting reader initialises the cv and sets the flag */
- if (!prev->r_read_wanted) {
- cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
- prev->r_read_wanted = B_TRUE;
- }
- cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- /* new range ends inside prev: later entries need no check */
- if (off + len < prev->r_off + prev->r_len)
- goto got_lock;
- }
-
- /*
- * Search through the following ranges to see if there's
- * any write lock overlap.
- */
- if (prev)
- next = AVL_NEXT(tree, prev);
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next; next = AVL_NEXT(tree, next)) {
- /* this entry starts at or after our end: nothing further overlaps */
- if (off + len <= next->r_off)
- goto got_lock;
- if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
- if (!next->r_read_wanted) {
- cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
- next->r_read_wanted = B_TRUE;
- }
- cv_wait(&next->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- if (off + len <= next->r_off + next->r_len)
- goto got_lock;
- }
-
-got_lock:
- /*
- * Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_cnt).
- */
- zfs_range_add_reader(tree, new, prev, where);
-}
-
-/*
- * Lock the range (off, len) of a znode, either shared (RL_READER) or
- * exclusive (RL_WRITER, or RL_APPEND which is handled by the writer
- * path). May block until the range is available.
- *
- * Returns the newly allocated range lock structure, to be passed to
- * zfs_range_unlock() later or to zfs_range_reduce() (if the entire
- * file was previously locked as RL_WRITER).
- */
-rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
-{
-	rl_t *lock;
-
-	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
-
-	lock = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	lock->r_zp = zp;
-	lock->r_type = type;
-	lock->r_off = off;
-	lock->r_len = len;
-	lock->r_cnt = 1;	/* assume it's going to be in the tree */
-	lock->r_proxy = B_FALSE;
-	lock->r_read_wanted = B_FALSE;
-	lock->r_write_wanted = B_FALSE;
-
-	mutex_enter(&zp->z_range_lock);
-	if (type != RL_READER) {
-		/* RL_WRITER or RL_APPEND */
-		zfs_range_lock_writer(zp, lock);
-	} else if (avl_numnodes(&zp->z_range_avl) == 0) {
-		/* the usual case of no locks at all */
-		avl_add(&zp->z_range_avl, lock);
-	} else {
-		zfs_range_lock_reader(zp, lock);
-	}
-	mutex_exit(&zp->z_range_lock);
-
-	return (lock);
-}
-
-/*
- * Unlock a reader lock
- *
- * zp - znode owning the range tree
- * remove - the caller's lock as returned by zfs_range_lock(); it is
- * freed here in every case
- *
- * Called by zfs_range_unlock() with zp->z_range_lock held. Waiters
- * flagged on any entry that leaves the tree are woken up.
- */
-static void
-zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl, *next;
- uint64_t len;
-
- /*
- * The common case is when the remove entry is in the tree
- * (cnt == 1) meaning there's been no other reader locks overlapping
- * with this one. Otherwise the remove entry will have been
- * removed from the tree and replaced by proxies (one or
- * more ranges mapping to the entire range).
- */
- if (remove->r_cnt == 1) {
- avl_remove(tree, remove);
- if (remove->r_write_wanted) {
- cv_broadcast(&remove->r_wr_cv);
- cv_destroy(&remove->r_wr_cv);
- }
- if (remove->r_read_wanted) {
- cv_broadcast(&remove->r_rd_cv);
- cv_destroy(&remove->r_rd_cv);
- }
- } else {
- ASSERT3U(remove->r_cnt, ==, 0);
- ASSERT3U(remove->r_write_wanted, ==, 0);
- ASSERT3U(remove->r_read_wanted, ==, 0);
- /*
- * Find start proxy representing this reader lock,
- * then decrement ref count on all proxies
- * that make up this range, freeing them as needed.
- */
- rl = avl_find(tree, remove, NULL);
- ASSERT(rl);
- ASSERT(rl->r_cnt);
- ASSERT(rl->r_type == RL_READER);
- /*
- * len counts down the part of the original range not yet
- * accounted for; each proxy visited covers r_len of it.
- */
- for (len = remove->r_len; len != 0; rl = next) {
- len -= rl->r_len;
- if (len) {
- /* fetch the successor before rl can be freed below */
- next = AVL_NEXT(tree, rl);
- ASSERT(next);
- ASSERT(rl->r_off + rl->r_len == next->r_off);
- ASSERT(next->r_cnt);
- ASSERT(next->r_type == RL_READER);
- }
- rl->r_cnt--;
- if (rl->r_cnt == 0) {
- /* last reference: drop the proxy and wake any waiters */
- avl_remove(tree, rl);
- if (rl->r_write_wanted) {
- cv_broadcast(&rl->r_wr_cv);
- cv_destroy(&rl->r_wr_cv);
- }
- if (rl->r_read_wanted) {
- cv_broadcast(&rl->r_rd_cv);
- cv_destroy(&rl->r_rd_cv);
- }
- kmem_free(rl, sizeof (rl_t));
- }
- }
- }
- kmem_free(remove, sizeof (rl_t));
-}
-
-/*
- * Release a range lock obtained from zfs_range_lock() and free the
- * range lock structure. 'rl' must not be used afterwards.
- */
-void
-zfs_range_unlock(rl_t *rl)
-{
-	znode_t *zp = rl->r_zp;
-
-	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
-	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
-	ASSERT(!rl->r_proxy);
-
-	mutex_enter(&zp->z_range_lock);
-
-	if (rl->r_type != RL_WRITER) {
-		/*
-		 * A reader lock may be shared; zfs_range_unlock_reader()
-		 * releases the lock and frees the rl_t.
-		 */
-		zfs_range_unlock_reader(zp, rl);
-		mutex_exit(&zp->z_range_lock);
-		return;
-	}
-
-	/* writer locks can't be shared or split */
-	avl_remove(&zp->z_range_avl, rl);
-	mutex_exit(&zp->z_range_lock);
-
-	if (rl->r_write_wanted) {
-		cv_broadcast(&rl->r_wr_cv);
-		cv_destroy(&rl->r_wr_cv);
-	}
-	if (rl->r_read_wanted) {
-		cv_broadcast(&rl->r_rd_cv);
-		cv_destroy(&rl->r_rd_cv);
-	}
-	kmem_free(rl, sizeof (rl_t));
-}
-
-/*
- * Shrink a whole-file RL_WRITER lock down to the range (off, len).
- * Asserts that the whole file is exclusively locked, and hence that
- * this lock is the only entry in the tree. Threads waiting on the lock
- * are woken up afterwards so that they recheck against the new range.
- */
-void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
-{
-	znode_t *zp = rl->r_zp;
-
-	/* Ensure there are no other locks */
-	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
-	ASSERT(rl->r_type == RL_WRITER);
-	ASSERT(!rl->r_proxy);
-	ASSERT(rl->r_off == 0);
-	ASSERT3U(rl->r_len, ==, UINT64_MAX);
-	ASSERT3U(rl->r_cnt, ==, 1);
-
-	mutex_enter(&zp->z_range_lock);
-	rl->r_len = len;
-	rl->r_off = off;
-	mutex_exit(&zp->z_range_lock);
-
-	if (rl->r_write_wanted) {
-		cv_broadcast(&rl->r_wr_cv);
-	}
-	if (rl->r_read_wanted) {
-		cv_broadcast(&rl->r_rd_cv);
-	}
-}
-
-/*
- * AVL comparison function used to order range locks.
- * Only the start offset (r_off) of each range is compared.
- * Returns -1, 0 or 1 when the first lock starts before, at, or after
- * the second one.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
-	const rl_t *lhs = arg1;
-	const rl_t *rhs = arg2;
-
-	return ((lhs->r_off > rhs->r_off) - (lhs->r_off < rhs->r_off));
-}
diff --git a/zfs/lib/libdmu-ctl/zfs_vfsops.c b/zfs/lib/libdmu-ctl/zfs_vfsops.c
deleted file mode 100644
index 39c8ce4ef..000000000
--- a/zfs/lib/libdmu-ctl/zfs_vfsops.c
+++ /dev/null
@@ -1,1671 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zfs_vfsops.c 1.41 08/04/11 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/kmem.h>
-#include <sys/pathname.h>
-#include <sys/vnode.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/cmn_err.h>
-#include "fs/fs_subr.h"
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_deleg.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/varargs.h>
-#include <sys/policy.h>
-#include <sys/atomic.h>
-#include <sys/mkdev.h>
-#include <sys/modctl.h>
-#include <sys/refstr.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/bootconf.h>
-#include <sys/sunddi.h>
-#include <sys/dnlc.h>
-#include <sys/dmu_objset.h>
-#include <sys/spa_boot.h>
-
-/* filesystem type index; copied into vfs_fstype by zfs_domount() */
-int zfsfstype;
-vfsops_t *zfs_vfsops = NULL;
-/*
- * Major/minor pair used to hand out a unique dev_t per mount, see
- * zfs_create_unique_device(). Updates are made under zfs_dev_mtx.
- */
-static major_t zfs_major;
-static minor_t zfs_minor;
-static kmutex_t zfs_dev_mtx;
-
-/* forward declarations for the entry points wired into the vfsops tables */
-static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
-static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
-static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
-static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
-static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
-static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
-static void zfs_freevfs(vfs_t *vfsp);
-
-/*
- * VFS operation table for a normally mounted ZFS filesystem: each entry
- * pairs a VFSNAME_* operation name with the zfs_* routine implementing it.
- * The list is terminated by a NULL, NULL entry.
- */
-static const fs_operation_def_t zfs_vfsops_template[] = {
- VFSNAME_MOUNT, { .vfs_mount = zfs_mount },
- VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot },
- VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount },
- VFSNAME_ROOT, { .vfs_root = zfs_root },
- VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs },
- VFSNAME_SYNC, { .vfs_sync = zfs_sync },
- VFSNAME_VGET, { .vfs_vget = zfs_vget },
- VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
- NULL, NULL
-};
-
-/*
- * Reduced operation table that only provides the freevfs operation.
- * NOTE(review): presumably installed once a filesystem has been forcibly
- * unmounted so every other operation fails -- confirm against the user
- * of this table, which is outside this part of the file.
- */
-static const fs_operation_def_t zfs_vfsops_eio_template[] = {
- VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
- NULL, NULL
-};
-
-/*
- * We need to keep a count of active fs's.
- * This is necessary to prevent our module
- * from being unloaded after a umount -f
- *
- * The count is incremented atomically by zfs_domount() on success.
- */
-static uint32_t zfs_active_fs_count = 0;
-
-/* for each option, the list of options that it cancels (NULL terminated) */
-static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
-static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
-static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
-static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
-
-/*
- * MO_DEFAULT is not used since the default value is determined
- * by the equivalent property.
- */
-static mntopt_t mntopts[] = {
- { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
- { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
- { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
- { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
-};
-
-/* option table descriptor: entry count followed by the table itself */
-static mntopts_t zfs_mntopts = {
- sizeof (mntopts) / sizeof (mntopt_t),
- mntopts
-};
-
-/*
- * VFS sync entry point.
- *
- * vfsp - filesystem to sync, or NULL to sync every pool
- * flag - SYNC_* flags; SYNC_ATTR requests are a no-op for ZFS
- * cr   - caller credentials (unused)
- *
- * Always returns 0 from this function's own code paths.
- */
-/*ARGSUSED*/
-int
-zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
-{
- zfsvfs_t *zfsvfs;
-
- /*
- * Two cases need no work at all:
- * - the kernel is panicking: data integrity comes first, so a
- *   compromised kernel is never allowed to write to the pool;
- * - SYNC_ATTR: fsflush() uses it to make filesystems like UFS push
- *   cached metadata. Only initiating a sync is required, and the
- *   DMU already syncs out txgs frequently.
- */
- if (panicstr || (flag & SYNC_ATTR))
- return (0);
-
- if (vfsp == NULL) {
- /*
- * sync(1M) case: wait for all pools to commit all of their
- * dirty data, which is stronger than what other filesystems
- * do for this request.
- */
- spa_sync_allpools();
- return (0);
- }
-
- /* A single filesystem was named: commit its log or wait for the txg. */
- zfsvfs = vfsp->vfs_data;
-
- ZFS_ENTER(zfsvfs);
- if (zfsvfs->z_log == NULL)
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- else
- zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * Pick a device number that no mounted filesystem is using.
- *
- * dev - receives the chosen dev_t, built from zfs_major/zfs_minor
- *
- * Walks the minor numbers of the current major until vfs_devismounted()
- * reports a free one. If the walk comes back to where it started, a new
- * major number is requested via getudev() and the search restarts.
- *
- * Returns 0 on success, -1 if getudev() cannot supply a new major number.
- */
-static int
-zfs_create_unique_device(dev_t *dev)
-{
- major_t new_major;
-
- do {
- ASSERT3U(zfs_minor, <=, MAXMIN32);
- /*
- * NOTE(review): 'start' and the loop conditions below read
- * zfs_minor without holding zfs_dev_mtx -- confirm whether
- * callers serialize mounts some other way.
- */
- minor_t start = zfs_minor;
- do {
- mutex_enter(&zfs_dev_mtx);
- if (zfs_minor >= MAXMIN32) {
- /*
- * If we're still using the real major
- * keep out of /dev/zfs and /dev/zvol minor
- * number space. If we're using a getudev()'ed
- * major number, we can use all of its minors.
- */
- if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
- zfs_minor = ZFS_MIN_MINOR;
- else
- zfs_minor = 0;
- } else {
- zfs_minor++;
- }
- *dev = makedevice(zfs_major, zfs_minor);
- mutex_exit(&zfs_dev_mtx);
- } while (vfs_devismounted(*dev) && zfs_minor != start);
- if (zfs_minor == start) {
- /*
- * We are using all ~262,000 minor numbers for the
- * current major number. Create a new major number.
- */
- if ((new_major = getudev()) == (major_t)-1) {
- cmn_err(CE_WARN,
- "zfs_mount: Can't get unique major "
- "device number.");
- return (-1);
- }
- mutex_enter(&zfs_dev_mtx);
- zfs_major = new_major;
- zfs_minor = 0;
-
- mutex_exit(&zfs_dev_mtx);
- } else {
- /* found an unused minor under the current major */
- break;
- }
- /* CONSTANTCONDITION */
- } while (1);
-
- return (0);
-}
-
-/*
- * Property callback for "atime": records the setting in z_atime and
- * swaps the atime/noatime mount options to match. Only the exact value
- * TRUE enables atime; anything else disables it.
- */
-static void
-atime_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != TRUE) {
- zfsvfs->z_atime = FALSE;
- vfs_clearmntopt(vfsp, MNTOPT_ATIME);
- vfs_setmntopt(vfsp, MNTOPT_NOATIME, NULL, 0);
- return;
- }
-
- zfsvfs->z_atime = TRUE;
- vfs_clearmntopt(vfsp, MNTOPT_NOATIME);
- vfs_setmntopt(vfsp, MNTOPT_ATIME, NULL, 0);
-}
-
-/*
- * Property callback for "xattr": sets or clears VFS_XATTR in vfs_flag
- * and swaps the xattr/noxattr mount options. Only the exact value TRUE
- * enables extended attributes.
- */
-static void
-xattr_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != TRUE) {
- /* XXX locking on vfs_flag? */
- vfsp->vfs_flag &= ~VFS_XATTR;
- vfs_clearmntopt(vfsp, MNTOPT_XATTR);
- vfs_setmntopt(vfsp, MNTOPT_NOXATTR, NULL, 0);
- return;
- }
-
- /* XXX locking on vfs_flag? */
- vfsp->vfs_flag |= VFS_XATTR;
- vfs_clearmntopt(vfsp, MNTOPT_NOXATTR);
- vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 0);
-}
-
-/*
- * Property callback for "recordsize": stores the new maximum block size
- * in both the zfsvfs and the vfs. A value outside
- * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] or not a power of two is replaced
- * by SPA_MAXBLOCKSIZE.
- */
-static void
-blksz_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- uint64_t blksz = newval;
-
- if (!(blksz >= SPA_MINBLOCKSIZE &&
- blksz <= SPA_MAXBLOCKSIZE && ISP2(blksz)))
- blksz = SPA_MAXBLOCKSIZE;
-
- zfsvfs->z_max_blksz = blksz;
- zfsvfs->z_vfs->vfs_bsize = blksz;
-}
-
-/*
- * Property callback for "readonly": any non-zero value sets VFS_RDONLY
- * and the "ro" mount option, zero clears the flag and selects "rw".
- */
-static void
-readonly_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval == 0) {
- /* XXX locking on vfs_flag? */
- vfsp->vfs_flag &= ~VFS_RDONLY;
- vfs_clearmntopt(vfsp, MNTOPT_RO);
- vfs_setmntopt(vfsp, MNTOPT_RW, NULL, 0);
- return;
- }
-
- /* XXX locking on vfs_flag? */
- vfsp->vfs_flag |= VFS_RDONLY;
- vfs_clearmntopt(vfsp, MNTOPT_RW);
- vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
-}
-
-/*
- * Property callback for "devices": the value FALSE sets VFS_NODEVICES and
- * the "nodevices" option, any other value clears the flag and selects
- * "devices".
- */
-static void
-devices_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != FALSE) {
- vfsp->vfs_flag &= ~VFS_NODEVICES;
- vfs_clearmntopt(vfsp, MNTOPT_NODEVICES);
- vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, 0);
- return;
- }
-
- vfsp->vfs_flag |= VFS_NODEVICES;
- vfs_clearmntopt(vfsp, MNTOPT_DEVICES);
- vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL, 0);
-}
-
-/*
- * Property callback for "setuid": the value FALSE sets VFS_NOSETUID and
- * the "nosetuid" option, any other value clears the flag and selects
- * "setuid".
- */
-static void
-setuid_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != FALSE) {
- vfsp->vfs_flag &= ~VFS_NOSETUID;
- vfs_clearmntopt(vfsp, MNTOPT_NOSETUID);
- vfs_setmntopt(vfsp, MNTOPT_SETUID, NULL, 0);
- return;
- }
-
- vfsp->vfs_flag |= VFS_NOSETUID;
- vfs_clearmntopt(vfsp, MNTOPT_SETUID);
- vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
-}
-
-/*
- * Property callback for "exec": the value FALSE sets VFS_NOEXEC and the
- * "noexec" option, any other value clears the flag and selects "exec".
- */
-static void
-exec_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != FALSE) {
- vfsp->vfs_flag &= ~VFS_NOEXEC;
- vfs_clearmntopt(vfsp, MNTOPT_NOEXEC);
- vfs_setmntopt(vfsp, MNTOPT_EXEC, NULL, 0);
- return;
- }
-
- vfsp->vfs_flag |= VFS_NOEXEC;
- vfs_clearmntopt(vfsp, MNTOPT_EXEC);
- vfs_setmntopt(vfsp, MNTOPT_NOEXEC, NULL, 0);
-}
-
-/*
- * nbmand may only be chosen at mount time; toggling it on a live file
- * system could produce incorrect behavior for cifs clients.
- *
- * Unlike the other callbacks this one is not registered through
- * dsl_prop_register(); it is invoked directly when a file system is
- * first mounted. It only swaps the nbmand/nonbmand mount options.
- */
-static void
-nbmand_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- vfs_t *vfsp = zfsvfs->z_vfs;
-
- if (newval != FALSE) {
- vfs_clearmntopt(vfsp, MNTOPT_NONBMAND);
- vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL, 0);
- return;
- }
-
- vfs_clearmntopt(vfsp, MNTOPT_NBMAND);
- vfs_setmntopt(vfsp, MNTOPT_NONBMAND, NULL, 0);
-}
-
-/* Property callback for "snapdir": caches the value in z_show_ctldir. */
-static void
-snapdir_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_show_ctldir = newval;
-}
-
-/* Property callback for "vscan": caches the value in z_vscan. */
-static void
-vscan_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_vscan = newval;
-}
-
-/* Property callback for "aclmode": caches the value in z_acl_mode. */
-static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_acl_mode = newval;
-}
-
-/* Property callback for "aclinherit": caches the value in z_acl_inherit. */
-static void
-acl_inherit_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_acl_inherit = newval;
-}
-
-/*
- * Register the dataset property callbacks for a mounted filesystem and
- * re-apply any mount options that were explicitly set on the vfs.
- *
- * vfsp - the vfs being (re)mounted; vfs_data must point at its zfsvfs_t
- *
- * Returns 0 on success. On failure returns the error from
- * dsl_prop_get_integer() or dsl_prop_register(); in the latter case all
- * callbacks are unregistered again before returning.
- */
-static int
-zfs_register_callbacks(vfs_t *vfsp)
-{
- struct dsl_dataset *ds = NULL;
- objset_t *os = NULL;
- zfsvfs_t *zfsvfs = NULL;
- uint64_t nbmand;
- /*
- * Each value below is only meaningful (and only read) when its
- * matching do_* flag has been set to B_TRUE.
- */
- int readonly, do_readonly = B_FALSE;
- int setuid, do_setuid = B_FALSE;
- int exec, do_exec = B_FALSE;
- int devices, do_devices = B_FALSE;
- int xattr, do_xattr = B_FALSE;
- int atime, do_atime = B_FALSE;
- int error = 0;
-
- ASSERT(vfsp);
- zfsvfs = vfsp->vfs_data;
- ASSERT(zfsvfs);
- os = zfsvfs->z_os;
-
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- /* "nosuid" implies both nodevices and nosetuid */
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- devices = B_FALSE;
- setuid = B_FALSE;
- do_devices = B_TRUE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
- devices = B_FALSE;
- do_devices = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
- devices = B_TRUE;
- do_devices = B_TRUE;
- }
-
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
- xattr = B_FALSE;
- do_xattr = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
- xattr = B_TRUE;
- do_xattr = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
- atime = B_FALSE;
- do_atime = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
- atime = B_TRUE;
- do_atime = B_TRUE;
- }
-
- /*
- * nbmand is a special property. It can only be changed at
- * mount time.
- *
- * This is weird, but it is documented to only be changeable
- * at mount time.
- *
- * When neither mount option is present, the value is read from
- * the dataset's "nbmand" property instead.
- */
- if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
- nbmand = B_FALSE;
- } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
- nbmand = B_TRUE;
- } else {
- char osname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
- if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
- NULL))
- return (error);
- }
-
- /*
- * Register property callbacks.
- *
- * It would probably be fine to just check for i/o error from
- * the first prop_register(), but I guess I like to go
- * overboard...
- *
- * The "error ? error : ..." chain stops registering as soon as
- * one registration fails and keeps that first error.
- */
- ds = dmu_objset_ds(os);
- error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "xattr", xattr_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "recordsize", blksz_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "readonly", readonly_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "devices", devices_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "setuid", setuid_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "exec", exec_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "snapdir", snapdir_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclmode", acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclinherit", acl_inherit_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "vscan", vscan_changed_cb, zfsvfs);
- if (error)
- goto unregister;
-
- /*
- * Invoke our callbacks to restore temporary mount options.
- */
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_devices)
- devices_changed_cb(zfsvfs, devices);
- if (do_xattr)
- xattr_changed_cb(zfsvfs, xattr);
- if (do_atime)
- atime_changed_cb(zfsvfs, atime);
-
- /* nbmand has no registered callback, so it is always applied here */
- nbmand_changed_cb(zfsvfs, nbmand);
-
- return (0);
-
-unregister:
- /*
- * We may attempt to unregister some callbacks that are not
- * registered, but this is OK; it will simply return ENOMSG,
- * which we will ignore.
- */
- (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
- zfsvfs);
- (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
- return (error);
-
-}
-
-/*
- * Finish bringing a zfsvfs online: register property callbacks, attach
- * the zfsvfs to its objset, optionally replay the intent log, and open
- * the ZIL unless zil_disable is set.
- *
- * zfsvfs   - filesystem being set up; z_vfs and z_os must already be set
- * mounting - B_TRUE for a real mount (log replay is performed), B_FALSE
- *            when re-attaching without mounting, e.g. online recv
- *
- * Returns 0 on success or the error from zfs_register_callbacks().
- */
-static int
-zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
-{
- uint_t readonly;
- int error;
-
- error = zfs_register_callbacks(zfsvfs->z_vfs);
- if (error)
- return (error);
-
- /*
- * Set the objset user_ptr to track its zfsvfs.
- */
- mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
-
- /*
- * If we are not mounting (ie: online recv), then we don't
- * have to worry about replaying the log as we blocked all
- * operations out since we closed the ZIL.
- */
- if (mounting) {
- /*
- * During replay we remove the read only flag to
- * allow replays to succeed.
- *
- * 'readonly' keeps just the VFS_RDONLY bit so that it can
- * be OR-ed back in after the replay below.
- */
- readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
- if (readonly != 0)
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- else
- zfs_unlinked_drain(zfsvfs);
-
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest doesn't
- * use readonly mounts, where zfs_unlinked_drain() isn't
- * called.) This is because ziltest causes spa_sync()
- * to think it's committed, but actually it is not, so
- * the intent log contains many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated in
- * a yet later txg. This would write a "create object
- * N" record to the intent log. Normally, this would be
- * fine because the spa_sync() would have written out
- * the fact that object N is free, before we could write
- * the "create object N" intent log record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
- zfs_replay_vector);
-
- zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
- }
-
- /* z_log stays NULL when the ZIL is disabled */
- if (!zil_disable)
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
- return (0);
-}
-
-/*
- * Destroy every lock and list embedded in a zfsvfs_t that zfs_domount()
- * initialized, then release the structure itself.
- */
-static void
-zfs_freezfsvfs(zfsvfs_t *zfsvfs)
-{
- /* reader/writer locks */
- rw_destroy(&zfsvfs->z_fuid_lock);
- rw_destroy(&zfsvfs->z_teardown_inactive_lock);
- rrw_destroy(&zfsvfs->z_teardown_lock);
-
- /* znode list and the mutexes */
- list_destroy(&zfsvfs->z_all_znodes);
- mutex_destroy(&zfsvfs->z_online_recv_lock);
- mutex_destroy(&zfsvfs->z_znodes_lock);
-
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-}
-
-/*
- * Mount the dataset 'osname' on 'vfsp'.
- *
- * vfsp   - generic vfs to fill in; vfs_data is set to the new zfsvfs_t
- * osname - name of the objset (dataset or snapshot) to mount
- * cr     - credentials passed on to zfs_init_fs()
- *
- * Allocates and initializes the zfsvfs_t, picks a unique device number,
- * opens the objset (falling back to read-only on EROFS), sets the VFS
- * feature flags and then either applies the fixed snapshot settings or
- * runs zfsvfs_setup() for a regular filesystem.
- *
- * Returns 0 on success. On failure the objset is closed, the zfsvfs_t is
- * freed and the error is returned.
- * NOTE(review): on failure after vfs_data has been assigned, vfs_data is
- * left pointing at the freed zfsvfs_t -- confirm no caller touches it.
- */
-static int
-zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
-{
- dev_t mount_dev;
- uint64_t recordsize, readonly;
- int error = 0;
- int mode;
- zfsvfs_t *zfsvfs;
- znode_t *zp = NULL;
-
- ASSERT(vfsp);
- ASSERT(osname);
-
- /*
- * Initialize the zfs-specific filesystem structure.
- * Should probably make this a kmem cache, shuffle fields,
- * and just bzero up to z_hold_mtx[].
- */
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- zfsvfs->z_vfs = vfsp;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_assign = TXG_NOWAIT;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
- rrw_init(&zfsvfs->z_teardown_lock);
- rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
-
- /* Initialize the generic filesystem structure. */
- vfsp->vfs_bcount = 0;
- vfsp->vfs_data = NULL;
-
- if (zfs_create_unique_device(&mount_dev) == -1) {
- error = ENODEV;
- goto out;
- }
- ASSERT(vfs_devismounted(mount_dev) == 0);
-
- if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
- NULL))
- goto out;
-
- vfsp->vfs_dev = mount_dev;
- vfsp->vfs_fstype = zfsfstype;
- vfsp->vfs_bsize = recordsize;
- vfsp->vfs_flag |= VFS_NOTRUNC;
- vfsp->vfs_data = zfsvfs;
-
- if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
- goto out;
-
- if (readonly)
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- else
- mode = DS_MODE_PRIMARY;
-
- /* if a writable open is refused with EROFS, retry read-only */
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
- if (error == EROFS) {
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
- &zfsvfs->z_os);
- }
-
- if (error)
- goto out;
-
- if (error = zfs_init_fs(zfsvfs, &zp, cr))
- goto out;
-
- /* The call to zfs_init_fs leaves the vnode held, release it here. */
- VN_RELE(ZTOV(zp));
-
- /*
- * Set features for file system.
- */
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- if (zfsvfs->z_use_fuids) {
- vfs_set_feature(vfsp, VFSFT_XVATTR);
- vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
- vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
- }
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
- vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
- vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
- } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
- vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
- vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
- }
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
- uint64_t pval;
-
- /* snapshots are always mounted read-only with atime off */
- ASSERT(mode & DS_MODE_READONLY);
- atime_changed_cb(zfsvfs, B_FALSE);
- readonly_changed_cb(zfsvfs, B_TRUE);
- if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
- goto out;
- xattr_changed_cb(zfsvfs, pval);
- zfsvfs->z_issnap = B_TRUE;
- } else {
- /*
- * Bail out before creating the control directory if setup
- * failed: the zfsvfs is freed at 'out', so anything that
- * zfsctl_create() attached to it would be lost.
- */
- if (error = zfsvfs_setup(zfsvfs, B_TRUE))
- goto out;
- }
-
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- if (zfsvfs->z_os)
- dmu_objset_close(zfsvfs->z_os);
- zfs_freezfsvfs(zfsvfs);
- } else {
- atomic_add_32(&zfs_active_fs_count, 1);
- }
-
- return (error);
-}
-
-/*
- * Remove every property callback that zfs_register_callbacks() installs
- * for this zfsvfs. Nothing is done for snapshots. Each unregistration is
- * required to succeed (VERIFY).
- */
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- struct dsl_dataset *ds;
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os))
- return;
-
- ds = dmu_objset_ds(zfsvfs->z_os);
-
- VERIFY(0 == dsl_prop_unregister(ds, "atime",
- atime_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "xattr",
- xattr_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "recordsize",
- blksz_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "readonly",
- readonly_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "devices",
- devices_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "setuid",
- setuid_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "exec",
- exec_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "snapdir",
- snapdir_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "aclmode",
- acl_mode_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs));
- VERIFY(0 == dsl_prop_unregister(ds, "vscan",
- vscan_changed_cb, zfsvfs));
-}
-
-/*
- * Convert a decimal digit string to a uint64_t integer.
- *
- * str    - NUL-terminated string that must consist solely of '0'..'9'
- * objnum - receives the parsed value; written only on success
- *
- * Returns 0 on success, or EINVAL if the string contains a non-digit
- * character or the value does not fit in 64 bits. An empty string
- * parses as 0.
- */
-static int
-str_to_uint64(char *str, uint64_t *objnum)
-{
- uint64_t num = 0;
-
- while (*str) {
- uint64_t digit;
-
- if (*str < '0' || *str > '9')
- return (EINVAL);
- digit = *str++ - '0';
-
- /* reject input that would silently wrap past UINT64_MAX */
- if (num > (UINT64_MAX - digit) / 10)
- return (EINVAL);
-
- num = num * 10 + digit;
- }
-
- *objnum = num;
- return (0);
-}
-
-/*
- * The boot loader hands us a boot path of the form
- * "rootpool-name/root-filesystem-object-number". Translate it into the
- * dataset name "rootpool-name/root-filesystem-name".
- *
- * bpath   - boot path; temporarily split in place at the first '/' and
- *           restored before returning
- * outpath - receives the resulting name
- *
- * Returns 0 on success, EINVAL for an empty path or one starting with
- * '/', or the error from str_to_uint64() / dsl_dsobj_to_dsname().
- */
-static int
-zfs_parse_bootfs(char *bpath, char *outpath)
-{
- char *sep;
- uint64_t objnum;
- int error;
-
- if (bpath[0] == '\0' || bpath[0] == '/')
- return (EINVAL);
-
- sep = strchr(bpath, '/');
- if (sep == NULL) {
- /* no object number given: the pool name is the result */
- (void) strcpy(outpath, bpath);
- return (0);
- }
-
- error = str_to_uint64(sep + 1, &objnum);
- if (error != 0)
- return (error);
-
- /* look up the name with bpath cut down to just the pool name */
- *sep = '\0';
- error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
- *sep = '/';
-
- return (error);
-}
-
-/*
- * VFS mountroot entry point.
- *
- * vfsp - the root vfs
- * why  - ROOT_INIT (first mount of the root filesystem), ROOT_REMOUNT
- *        or ROOT_UNMOUNT
- *
- * Returns 0 on success, EBUSY if ROOT_INIT is requested more than once,
- * ENOTSUP for any other 'why' value, or the error of the failing step.
- */
-static int
-zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
-{
- int error = 0;
- /* counts ROOT_INIT requests; only the first one is honored */
- static int zfsrootdone = 0;
- zfsvfs_t *zfsvfs = NULL;
- znode_t *zp = NULL;
- vnode_t *vp = NULL;
- char *zfs_bootfs;
-
- ASSERT(vfsp);
-
- /*
- * The filesystem that we mount as root is defined in the
- * boot property "zfs-bootfs" with a format of
- * "poolname/root-dataset-objnum".
- */
- if (why == ROOT_INIT) {
- if (zfsrootdone++)
- return (EBUSY);
- /*
- * the process of doing a spa_load will require the
- * clock to be set before we could (for example) do
- * something better by looking at the timestamp on
- * an uberblock, so just set it to -1.
- */
- clkset(-1);
-
- if ((zfs_bootfs = spa_get_bootfs()) == NULL) {
- cmn_err(CE_NOTE, "\nspa_get_bootfs: can not get "
- "bootfs name \n");
- return (EINVAL);
- }
-
- if (error = spa_import_rootpool(rootfs.bo_name)) {
- spa_free_bootfs(zfs_bootfs);
- cmn_err(CE_NOTE, "\nspa_import_rootpool: error %d\n",
- error);
- return (error);
- }
-
- /* rootfs.bo_name is overwritten with the root dataset name */
- if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
- spa_free_bootfs(zfs_bootfs);
- cmn_err(CE_NOTE, "\nzfs_parse_bootfs: error %d\n",
- error);
- return (error);
- }
-
- spa_free_bootfs(zfs_bootfs);
-
- /* from here on every exit goes through 'out' to drop the lock */
- if (error = vfs_lock(vfsp))
- return (error);
-
- if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) {
- cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error);
- goto out;
- }
-
- zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
- ASSERT(zfsvfs);
- if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
- cmn_err(CE_NOTE, "\nzfs_zget: error %d\n", error);
- goto out;
- }
-
- vp = ZTOV(zp);
- mutex_enter(&vp->v_lock);
- vp->v_flag |= VROOT;
- mutex_exit(&vp->v_lock);
- rootvp = vp;
-
- /*
- * The zfs_zget call above returns with a hold on vp, we release
- * it here.
- */
- VN_RELE(vp);
-
- vfs_add((struct vnode *)0, vfsp,
- (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
-out:
- vfs_unlock(vfsp);
- return (error);
- } else if (why == ROOT_REMOUNT) {
- /* the root is switched to read-write before options are refreshed */
- readonly_changed_cb(vfsp->vfs_data, B_FALSE);
- vfsp->vfs_flag |= VFS_REMOUNT;
-
- /* refresh mount options */
- zfs_unregister_callbacks(vfsp->vfs_data);
- return (zfs_register_callbacks(vfsp));
-
- } else if (why == ROOT_UNMOUNT) {
- zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
- (void) zfs_sync(vfsp, 0, 0);
- return (0);
- }
-
- /*
- * if "why" is equal to anything else other than ROOT_INIT,
- * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
- */
- return (ENOTSUP);
-}
-
-/*
- * VFS mount entry point.
- *
- * vfsp - vfs being mounted (or remounted)
- * mvp  - vnode of the mount point; must be a directory
- * uap  - mount arguments: flags, the "special" dataset name, data length
- * cr   - credentials of the caller
- *
- * Returns 0 on success, otherwise an errno value: ENOTDIR, EBUSY, EINVAL,
- * EPERM, or the error of the failing helper.
- */
-/*ARGSUSED*/
-static int
-zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
-{
- char *osname;
- pathname_t spn;
- int error = 0;
- /* the dataset name is copied from kernel or user space per MS_SYSSPACE */
- uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
- UIO_SYSSPACE : UIO_USERSPACE;
- int canwrite;
-
- if (mvp->v_type != VDIR)
- return (ENOTDIR);
-
- /*
- * Unless remounting or overlaying, refuse a mount point that has
- * other references or is itself the root of a filesystem.
- */
- mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_REMOUNT) == 0 &&
- (uap->flags & MS_OVERLAY) == 0 &&
- (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
- mutex_exit(&mvp->v_lock);
- return (EBUSY);
- }
- mutex_exit(&mvp->v_lock);
-
- /*
- * ZFS does not support passing unparsed data in via MS_DATA.
- * Users should use the MS_OPTIONSTR interface; this means
- * that all option parsing is already done and the options struct
- * can be interrogated.
- */
- if ((uap->flags & MS_DATA) && uap->datalen > 0)
- return (EINVAL);
-
- /*
- * Get the objset name (the "special" mount argument).
- * Once this succeeds, every exit goes through 'out' to free spn.
- */
- if (error = pn_get(uap->spec, fromspace, &spn))
- return (error);
-
- osname = spn.pn_path;
-
- /*
- * Check for mount privilege?
- *
- * If we don't have privilege then see if
- * we have local permission to allow it
- */
- error = secpolicy_fs_mount(cr, mvp, vfsp);
- if (error) {
- error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
- if (error == 0) {
- vattr_t vattr;
-
- /*
- * Make sure user is the owner of the mount point
- * or has sufficient privileges.
- */
-
- vattr.va_mask = AT_UID;
-
- if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
- goto out;
- }
-
- if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
- VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
- error = EPERM;
- goto out;
- }
-
- secpolicy_fs_mount_clearopts(cr, vfsp);
- } else {
- goto out;
- }
- }
-
- /*
- * Refuse to mount a filesystem if we are in a local zone and the
- * dataset is not visible.
- */
- if (!INGLOBALZONE(curproc) &&
- (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
- error = EPERM;
- goto out;
- }
-
- /*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
- */
- if (uap->flags & MS_REMOUNT) {
- /* refresh mount options */
- zfs_unregister_callbacks(vfsp->vfs_data);
- error = zfs_register_callbacks(vfsp);
- goto out;
- }
-
- error = zfs_domount(vfsp, osname, cr);
-
-out:
- pn_free(&spn);
- return (error);
-}
-
-/*
- * VFS statvfs entry point: fill 'statp' with space and object usage for
- * the filesystem mounted on 'vfsp', using figures from
- * dmu_objset_space().
- *
- * Returns 0 after filling in 'statp'.
- */
-static int
-zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- dev32_t d32;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- /*
- * The underlying storage pool actually uses multiple block sizes.
- * We report the fragsize as the smallest block size we support,
- * and we report our blocksize as the filesystem's maximum blocksize.
- */
- statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
- statp->f_bsize = zfsvfs->z_max_blksz;
-
- /*
- * The following report "total" blocks of various kinds in the
- * file system, but reported in terms of f_frsize - the
- * "fragment" size.
- */
-
- statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
- statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
- statp->f_bavail = statp->f_bfree; /* no root reservation */
-
- /*
- * statvfs() should really be called statufs(), because it assumes
- * static metadata. ZFS doesn't preallocate files, so the best
- * we can do is report the max that could possibly fit in f_files,
- * and that minus the number actually used in f_ffree.
- * For f_ffree, report the smaller of the number of objects available
- * and the number of blocks (each object will take at least a block).
- */
- statp->f_ffree = MIN(availobjs, statp->f_bfree);
- statp->f_favail = statp->f_ffree; /* no "root reservation" */
- statp->f_files = statp->f_ffree + usedobjs;
-
- /* f_fsid is the 32-bit compressed form of the mount's dev_t */
- (void) cmpldev(&d32, vfsp->vfs_dev);
- statp->f_fsid = d32;
-
- /*
- * We're a zfs filesystem.
- */
- (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
-
- statp->f_flag = vf_to_stf(vfsp->vfs_flag);
-
- statp->f_namemax = ZFS_MAXNAMELEN;
-
- /*
- * We have all of 32 characters to stuff a string here.
- * Is there anything useful we could/should provide?
- */
- bzero(statp->f_fstr, sizeof (statp->f_fstr));
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * VFS root entry point: look up the znode of the filesystem root
- * (z_root) and hand back its vnode through 'vpp'.
- *
- * Returns 0 on success or the error from zfs_zget(); 'vpp' is only
- * written on success.
- */
-static int
-zfs_root(vfs_t *vfsp, vnode_t **vpp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- int err;
-
- ZFS_ENTER(zfsvfs);
-
- err = zfs_zget(zfsvfs, zfsvfs->z_root, &zp);
- if (!err)
- *vpp = ZTOV(zp);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Teardown the zfsvfs::z_os.
- *
- * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
- * and 'z_teardown_inactive_lock' held.
- *
- * Returns 0 on success, or EIO (with both locks released) when not
- * unmounting and the filesystem was already unmounted or has no z_os.
- */
-static int
-zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
-{
- znode_t *zp;
-
- rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
-
- if (!unmounting) {
- /*
- * We purge the parent filesystem's vfsp as the parent
- * filesystem and all of its snapshots have their vnode's
- * v_vfsp set to the parent's filesystem's vfsp. Note,
- * 'z_parent' is self referential for non-snapshots.
- */
- (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
- }
-
- /*
- * Close the zil. NB: Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
-
- /*
- * If we are not unmounting (ie: online recv) and someone already
- * unmounted this file system while we were doing the switcheroo,
- * or a reopen of z_os failed then just bail out now.
- */
- if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
- return (EIO);
- }
-
- /*
- * At this point there are no vops active, and any new vops will
- * fail with EIO since we have z_teardown_lock for writer (only
- * relevant for forced unmount).
- *
- * Release all holds on dbufs.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
- zp = list_next(&zfsvfs->z_all_znodes, zp))
- if (zp->z_dbuf) {
- ASSERT(ZTOV(zp)->v_count > 0);
- zfs_znode_dmu_fini(zp);
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * If we are unmounting, set the unmounted flag and let new vops
- * unblock. zfs_inactive will have the unmounted behavior, and all
- * other vops will fail with EIO.
- */
- if (unmounting) {
- zfsvfs->z_unmounted = B_TRUE;
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- }
-
- /*
- * z_os will be NULL if there was an error in attempting to reopen
- * zfsvfs, so just return as the properties had already been
- * unregistered and cached data had been evicted before.
- */
- if (zfsvfs->z_os == NULL)
- return (0);
-
- /*
- * Unregister properties.
- */
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Evict cached data
- *
- * If the first eviction pass returns non-zero, wait for the pool
- * to sync and make one more attempt.
- */
- if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
- }
-
- return (0);
-}
-
-/*ARGSUSED*/
-static int
-zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- objset_t *os;
- int ret;
-
- ret = secpolicy_fs_unmount(cr, vfsp);
- if (ret) {
- ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
- ZFS_DELEG_PERM_MOUNT, cr);
- if (ret)
- return (ret);
- }
-
- /*
- * We purge the parent filesystem's vfsp as the parent filesystem
- * and all of its snapshots have their vnode's v_vfsp set to the
- * parent's filesystem's vfsp. Note, 'z_parent' is self
- * referential for non-snapshots.
- */
- (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
-
- /*
- * Unmount any snapshots mounted under .zfs before unmounting the
- * dataset itself.
- */
- if (zfsvfs->z_ctldir != NULL &&
- (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
- return (ret);
- }
-
- if (!(fflag & MS_FORCE)) {
- /*
- * Check the number of active vnodes in the file system.
- * Our count is maintained in the vfs structure, but the
- * number is off by 1 to indicate a hold on the vfs
- * structure itself.
- *
- * The '.zfs' directory maintains a reference of its
- * own, and any active references underneath are
- * reflected in the vnode count.
- */
- if (zfsvfs->z_ctldir == NULL) {
- if (vfsp->vfs_count > 1)
- return (EBUSY);
- } else {
- if (vfsp->vfs_count > 2 ||
- zfsvfs->z_ctldir->v_count > 1)
- return (EBUSY);
- }
- }
-
- vfsp->vfs_flag |= VFS_UNMOUNTED;
-
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
- os = zfsvfs->z_os;
-
- /*
- * z_os will be NULL if there was an error in
- * attempting to reopen zfsvfs.
- */
- if (os != NULL) {
- /*
- * Unset the objset user_ptr.
- */
- mutex_enter(&os->os->os_user_ptr_lock);
- dmu_objset_set_user(os, NULL);
- mutex_exit(&os->os->os_user_ptr_lock);
-
- /*
- * Finally close the objset
- */
- dmu_objset_close(os);
- }
-
- /*
- * We can now safely destroy the '.zfs' directory node.
- */
- if (zfsvfs->z_ctldir != NULL)
- zfsctl_destroy(zfsvfs);
-
- return (0);
-}
-
-static int
-zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- uint64_t object = 0;
- uint64_t fid_gen = 0;
- uint64_t gen_mask;
- uint64_t zp_gen;
- int i, err;
-
- *vpp = NULL;
-
- ZFS_ENTER(zfsvfs);
-
- if (fidp->fid_len == LONG_FID_LEN) {
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint64_t objsetid = 0;
- uint64_t setgen = 0;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
- if (err)
- return (EINVAL);
- ZFS_ENTER(zfsvfs);
- }
-
- if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
- zfid_short_t *zfid = (zfid_short_t *)fidp;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
- } else {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /* A zero fid_gen means we are in the .zfs control directories */
- if (fid_gen == 0 &&
- (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
- *vpp = zfsvfs->z_ctldir;
- ASSERT(*vpp != NULL);
- if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
- 0, NULL, NULL, NULL, NULL, NULL) == 0);
- } else {
- VN_HOLD(*vpp);
- }
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- gen_mask = -1ULL >> (64 - 8 * i);
-
- dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
- if (err = zfs_zget(zfsvfs, object, &zp)) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- zp_gen = zp->z_phys->zp_gen & gen_mask;
- if (zp_gen == 0)
- zp_gen = 1;
- if (zp->z_unlinked || zp_gen != fid_gen) {
- dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- *vpp = ZTOV(zp);
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Block out VOPs and close zfsvfs_t::z_os
- *
- * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
- */
-int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
-{
- int error;
-
- if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
- return (error);
-
- *mode = zfsvfs->z_os->os_mode;
- dmu_objset_name(zfsvfs->z_os, name);
- dmu_objset_close(zfsvfs->z_os);
-
- return (0);
-}
-
-/*
- * Reopen zfsvfs_t::z_os and release VOPs.
- */
-int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
-{
- int err;
-
- ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
-
- err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
- if (err) {
- zfsvfs->z_os = NULL;
- } else {
- znode_t *zp;
-
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
- /*
- * Attempt to re-establish all the active znodes with
- * their dbufs. If a zfs_rezget() fails, then we'll let
- * any potential callers discover that via ZFS_ENTER_VERIFY_VP
- * when they try to use their znode.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- (void) zfs_rezget(zp);
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- }
-
- /* release the VOPs */
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
-
- if (err) {
- /*
- * Since we couldn't reopen zfsvfs::z_os, force
- * unmount this file system.
- */
- if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
- (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
- }
- return (err);
-}
-
-static void
-zfs_freevfs(vfs_t *vfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- int i;
-
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
-
- zfs_fuid_destroy(zfsvfs);
- zfs_freezfsvfs(zfsvfs);
-
- atomic_add_32(&zfs_active_fs_count, -1);
-}
-
-/*
- * VFS_INIT() initialization. Note that there is no VFS_FINI(),
- * so we can't safely do any non-idempotent initialization here.
- * Leave that to zfs_init() and zfs_fini(), which are called
- * from the module's _init() and _fini() entry points.
- */
-/*ARGSUSED*/
-static int
-zfs_vfsinit(int fstype, char *name)
-{
- int error;
-
- zfsfstype = fstype;
-
- /*
- * Setup vfsops and vnodeops tables.
- */
- error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
- if (error != 0) {
- cmn_err(CE_WARN, "zfs: bad vfs ops template");
- }
-
- error = zfs_create_op_tables();
- if (error) {
- zfs_remove_op_tables();
- cmn_err(CE_WARN, "zfs: bad vnode ops template");
- (void) vfs_freevfsops_by_type(zfsfstype);
- return (error);
- }
-
- mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- /*
- * Unique major number for all zfs mounts.
- * If we run out of 32-bit minors, we'll getudev() another major.
- */
- zfs_major = ddi_name_to_major(ZFS_DRIVER);
- zfs_minor = ZFS_MIN_MINOR;
-
- return (0);
-}
-
-void
-zfs_init(void)
-{
- /*
- * Initialize .zfs directory structures
- */
- zfsctl_init();
-
- /*
- * Initialize znode cache, vnode ops, etc...
- */
- zfs_znode_init();
-}
-
-void
-zfs_fini(void)
-{
- zfsctl_fini();
- zfs_znode_fini();
-}
-
-int
-zfs_busy(void)
-{
- return (zfs_active_fs_count != 0);
-}
-
-int
-zfs_set_version(const char *name, uint64_t newvers)
-{
- int error;
- objset_t *os;
- dmu_tx_t *tx;
- uint64_t curvers;
-
- /*
- * XXX for now, require that the filesystem be unmounted. Would
- * be nice to find the zfsvfs_t and just update that if
- * possible.
- */
-
- if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
- return (EINVAL);
-
- error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_PRIMARY, &os);
- if (error)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- 8, 1, &curvers);
- if (error)
- goto out;
- if (newvers < curvers) {
- error = EINVAL;
- goto out;
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- goto out;
- }
- error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
- &newvers, tx);
-
- spa_history_internal_log(LOG_DS_UPGRADE,
- dmu_objset_spa(os), tx, CRED(),
- "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
- dmu_objset_id(os));
- dmu_tx_commit(tx);
-
-out:
- dmu_objset_close(os);
- return (error);
-}
-
-/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- const char *pname;
- int error;
-
- /*
- * Look up the file system's value for the property. For the
- * version property, we look up a slightly different string.
- */
- if (prop == ZFS_PROP_VERSION)
- pname = ZPL_VERSION_STR;
- else
- pname = zfs_prop_to_name(prop);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- default:
- return (error);
- }
- error = 0;
- }
- return (error);
-}
-
-static vfsdef_t vfw = {
- VFSDEF_VERSION,
- MNTTYPE_ZFS,
- zfs_vfsinit,
- VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
- VSW_XID,
- &zfs_mntopts
-};
-
-struct modlfs zfs_modlfs = {
- &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
-};
diff --git a/zfs/lib/libdmu-ctl/zfs_vnops.c b/zfs/lib/libdmu-ctl/zfs_vnops.c
deleted file mode 100644
index 3f36328de..000000000
--- a/zfs/lib/libdmu-ctl/zfs_vnops.c
+++ /dev/null
@@ -1,4558 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#pragma ident "@(#)zfs_vnops.c 1.73 08/04/27 SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/uio.h>
-#include <sys/vmsystm.h>
-#include <sys/atomic.h>
-#include <sys/vm.h>
-#include <vm/seg_vn.h>
-#include <vm/pvn.h>
-#include <vm/as.h>
-#include <sys/mman.h>
-#include <sys/pathname.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/sunddi.h>
-#include <sys/filio.h>
-#include "fs/fs_subr.h"
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/dnlc.h>
-#include <sys/zfs_rlock.h>
-#include <sys/extdirent.h>
-#include <sys/kidmap.h>
-#include <sys/cred_impl.h>
-#include <sys/attr.h>
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
- * can return EIO from the calling function.
- *
- * (2) VN_RELE() should always be the last thing except for zil_commit()
- * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
- * First, if it's the last reference, the vnode/znode
- * can be freed, so the zp may point to freed memory. Second, the last
- * reference will call zfs_zinactive(), which may induce a lot of work --
- * pushing cached pages (which acquires range locks) and syncing out
- * cached atime changes. Third, zfs_zinactive() may require a new tx,
- * which could deadlock the system if you were already holding one.
- *
- * (3) All range locks must be grabbed before calling dmu_tx_assign(),
- * as they can span dmu_tx_assign() calls.
- *
- * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
- * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
- * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
- * This is critical because we don't want to block while holding locks.
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing to
- * use a non-blocking assign can deadlock the system. The scenario:
- *
- * Thread A has grabbed a lock before calling dmu_tx_assign().
- * Thread B is in an already-assigned tx, and blocks for this lock.
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
- * forever, because the previous txg can't quiesce until B's tx commits.
- *
- * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again.
- *
- * (5) If the operation succeeded, generate the intent log entry for it
- * before dropping locks. This ensures that the ordering of events
- * in the intent log matches the order in which they actually occurred.
- *
- * (6) At the end of each vnode op, the DMU tx must always commit,
- * regardless of whether there were any errors.
- *
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
- * to ensure that synchronous semantics are provided when necessary.
- *
- * In general, this is how things should be ordered in each vnode op:
- *
- * ZFS_ENTER(zfsvfs); // exit if unmounted
- * top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
- * rw_enter(...); // grab any other locks you need
- * tx = dmu_tx_create(...); // get DMU tx
- * dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-
-/* ARGSUSED */
-static int
-zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(*vpp);
-
- if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
- ((flag & FAPPEND) == 0)) {
- return (EPERM);
- }
-
- if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
- ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0)
- if (fs_vscan(*vpp, cr, 0) != 0)
- return (EACCES);
-
- /* Keep a count of the synchronous opens in the znode */
- if (flag & (FSYNC | FDSYNC))
- atomic_inc_32(&zp->z_sync_cnt);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
-
- /* Decrement the synchronous opens in the znode */
- if ((flag & (FSYNC | FDSYNC)) && (count == 1))
- atomic_dec_32(&zp->z_sync_cnt);
-
- /*
- * Clean up any locks held by this process on the vp.
- */
- cleanlocks(vp, ddi_get_pid(), 0);
- cleanshares(vp, ddi_get_pid());
-
- if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
- ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0)
- VERIFY(fs_vscan(vp, cr, 1) == 0);
-
- return (0);
-}
-
-/*
- * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
- * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey(vnode_t *vp, int cmd, offset_t *off)
-{
- znode_t *zp = VTOZ(vp);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_phys->zp_size;
- if (noff >= file_sz) {
- return (ENXIO);
- }
-
- if (cmd == _FIO_SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
-
- /* end of file? */
- if ((error == ESRCH) || (noff > file_sz)) {
- /*
- * Handle the virtual hole at the end of file.
- */
- if (hole) {
- *off = file_sz;
- return (0);
- }
- return (ENXIO);
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
- int *rvalp, caller_context_t *ct)
-{
- offset_t off;
- int error;
- zfsvfs_t *zfsvfs;
- znode_t *zp;
-
- switch (com) {
- case _FIOFFS:
- return (zfs_sync(vp->v_vfsp, 0, cred));
-
- /*
- * The following two ioctls are used by bfu. Faking out,
- * necessary to avoid bfu errors.
- */
- case _FIOGDIO:
- case _FIOSDIO:
- return (0);
-
- case _FIO_SEEK_DATA:
- case _FIO_SEEK_HOLE:
- if (ddi_copyin((void *)data, &off, sizeof (off), flag))
- return (EFAULT);
-
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* offset parameter is in/out */
- error = zfs_holey(vp, com, &off);
- ZFS_EXIT(zfsvfs);
- if (error)
- return (error);
- if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
- return (EFAULT);
- return (0);
- }
- return (ENOTTY);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int64_t start, off;
- int len = nbytes;
- int error = 0;
-
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- page_t *pp;
- uint64_t bytes = MIN(PAGESIZE - off, len);
- uint64_t woff = uio->uio_loffset;
-
- /*
- * We don't want a new page to "appear" in the middle of
- * the file update (because it may not get the write
- * update data), so we grab a lock to block
- * zfs_getpage().
- */
- rw_enter(&zp->z_map_lock, RW_WRITER);
- if (pp = page_lookup(vp, start, SE_SHARED)) {
- caddr_t va;
-
- rw_exit(&zp->z_map_lock);
- va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
- error = uiomove(va+off, bytes, UIO_WRITE, uio);
- if (error == 0) {
- dmu_write(zfsvfs->z_os, zp->z_id,
- woff, bytes, va+off, tx);
- }
- ppmapout(va);
- page_unlock(pp);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, bytes, tx);
- rw_exit(&zp->z_map_lock);
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- return (error);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- * else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedread(vnode_t *vp, int nbytes, uio_t *uio)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- int64_t start, off;
- int len = nbytes;
- int error = 0;
-
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- page_t *pp;
- uint64_t bytes = MIN(PAGESIZE - off, len);
-
- if (pp = page_lookup(vp, start, SE_SHARED)) {
- caddr_t va;
-
- va = ppmapin(pp, PROT_READ, (caddr_t)-1L);
- error = uiomove(va + off, bytes, UIO_READ, uio);
- ppmapout(va);
- page_unlock(pp);
- } else {
- error = dmu_read_uio(os, zp->z_id, uio, bytes);
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- return (error);
-}
-
-offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: vp - vnode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - SYNC flags; used to provide FRSYNC semantics.
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Side Effects:
- * vp - atime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
- ssize_t n, nbytes;
- int error;
- rl_t *rl;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- os = zfsvfs->z_os;
-
- if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
- return (EACCES);
- }
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
- if (error = chklock(vp, FREAD,
- uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- */
- if (ioflag & FRSYNC)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
-
- while (n > 0) {
- nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
- if (vn_has_cached_data(vp))
- error = mappedread(vp, nbytes, uio);
- else
- error = dmu_read_uio(os, zp->z_id, uio, nbytes);
- if (error)
- break;
-
- n -= nbytes;
- }
-
-out:
- zfs_range_unlock(rl);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified.
- * Any error will exit this routine as this is only a best
- * attempt to get the pages resident. This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
- struct iovec *iov;
- ulong_t cnt, incr;
- caddr_t p;
- uint8_t tmp;
-
- iov = uio->uio_iov;
-
- while (n) {
- cnt = MIN(iov->iov_len, n);
- if (cnt == 0) {
- /* empty iov entry */
- iov++;
- continue;
- }
- n -= cnt;
- /*
- * touch each page in this segment.
- */
- p = iov->iov_base;
- while (cnt) {
- switch (uio->uio_segflg) {
- case UIO_USERSPACE:
- case UIO_USERISPACE:
- if (fuword8(p, &tmp))
- return;
- break;
- case UIO_SYSSPACE:
- if (kcopy(p, &tmp, 1))
- return;
- break;
- }
- incr = MIN(cnt, PAGESIZE);
- p += incr;
- cnt -= incr;
- }
- /*
- * touch the last byte in case it straddles a page.
- */
- p--;
- switch (uio->uio_segflg) {
- case UIO_USERSPACE:
- case UIO_USERISPACE:
- if (fuword8(p, &tmp))
- return;
- break;
- case UIO_SYSSPACE:
- if (kcopy(p, &tmp, 1))
- return;
- break;
- }
- iov++;
- }
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: vp - vnode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - FAPPEND flag set if in append mode.
- * cr - credentials of caller.
- * ct - caller context (NFS/CIFS fem monitor only)
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - ctime|mtime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- rlim64_t limit = uio->uio_llimit;
- ssize_t start_resid = uio->uio_resid;
- ssize_t tx_bytes;
- uint64_t end_size;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog;
- offset_t woff;
- ssize_t n, nbytes;
- rl_t *rl;
- int max_blksz = zfsvfs->z_max_blksz;
- uint64_t pflags = zp->z_phys->zp_flags;
- int error;
-
- /*
- * If immutable or not appending then return EPERM
- */
- if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_phys->zp_size)))
- return (EPERM);
-
- /*
- * Fasttrack empty write
- */
- n = start_resid;
- if (n == 0)
- return (0);
-
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zilog = zfsvfs->z_log;
-
- /*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- */
- zfs_prefault_write(n, uio);
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- if (ioflag & FAPPEND) {
- /*
- * Range lock for a file append:
- * The value for the start of range will be determined by
- * zfs_range_lock() (to guarantee append semantics).
- * If this write will cause the block size to increase,
- * zfs_range_lock() will lock the entire file, so we must
- * later reduce the range after we grow the block size.
- */
- rl = zfs_range_lock(zp, 0, n, RL_APPEND);
- if (rl->r_len == UINT64_MAX) {
- /* overlocked, zp_size can't change */
- woff = uio->uio_loffset = zp->z_phys->zp_size;
- } else {
- woff = uio->uio_loffset = rl->r_off;
- }
- } else {
- woff = uio->uio_loffset;
- /*
- * Validate file offset
- */
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If we need to grow the block size then zfs_range_lock()
- * will lock a wider range than we request here.
- * Later after growing the block size we reduce the range.
- */
- rl = zfs_range_lock(zp, woff, n, RL_WRITER);
- }
-
- if (woff >= limit) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (EFBIG);
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- end_size = MAX(zp->z_phys->zp_size, woff + n);
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- /*
- * Start a transaction.
- */
- woff = uio->uio_loffset;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- continue;
- }
- dmu_tx_abort(tx);
- break;
- }
-
- /*
- * If zfs_range_lock() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since zfs_range_reduce() will
- * shrink down r_len to the appropriate size.
- */
- if (rl->r_len == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(rl, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
- rw_enter(&zp->z_map_lock, RW_READER);
-
- tx_bytes = uio->uio_resid;
- if (vn_has_cached_data(vp)) {
- rw_exit(&zp->z_map_lock);
- error = mappedwrite(vp, nbytes, uio, tx);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, nbytes, tx);
- rw_exit(&zp->z_map_lock);
- }
- tx_bytes -= uio->uio_resid;
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the excute bits is set.
- *
- * It would be nice to to this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
- }
- mutex_exit(&zp->z_acl_lock);
-
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
- uio->uio_loffset);
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
- }
-
- zfs_range_unlock(rl);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (ioflag & (FSYNC | FDSYNC))
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Completion callback passed to dmu_sync() by zfs_get_data().
- * Drops the dbuf hold, the range lock and the vnode hold recorded in
- * the zgd_t, adds the block pointer to the ZIL, then frees the zgd_t.
- */
-void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
-{
-	zgd_t	*done = vzgd;
-	rl_t	*range = done->zgd_rl;
-	/* Look up the vnode now; the range lock is released below. */
-	vnode_t	*held_vp = ZTOV(range->r_zp);
-
-	dmu_buf_rele(db, vzgd);
-	zfs_range_unlock(range);
-	VN_RELE(held_vp);
-
-	zil_add_block(done->zgd_zilog, done->zgd_bp);
-	kmem_free(done, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- *
- * IN: arg - the zfsvfs_t of the file system.
- * lr - write log record; lr_foid, lr_offset and lr_length
- * identify the data to fetch.
- * buf - destination buffer for an immediate write, or NULL
- * to request an indirect write.
- * zio - zio handed to dmu_sync() for an indirect write.
- *
- * OUT: buf - file data (immediate write).
- * lr - lr_blkoff and lr_blkptr filled in (indirect write).
- *
- * RETURN: 0 if success, including the case where dmu_sync() returned
- * EINPROGRESS and zfs_get_done() will finish the cleanup
- * ENOENT if the file is gone or lr_offset is at/after zp_size
- * other error code returned by dmu_sync()
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t off = lr->lr_offset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
- int error = 0;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
- return (ENOENT);
- if (zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
- return (ENOENT);
- }
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
- } else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- /* block size unchanged while we waited: lock is good */
- if (zp->z_blksz == dlen)
- break;
- zfs_range_unlock(rl);
- }
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- /*
- * The zgd_t carries the state zfs_get_done() needs to release
- * the range lock and vnode hold and to register the block.
- */
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
- VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT((error && error != EINPROGRESS) ||
- lr->lr_length <= zp->z_blksz);
- if (error == 0)
- zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- /*
- * Not in progress: release the dbuf hold and the zgd_t here;
- * the range lock and vnode hold are dropped at "out".
- */
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
- }
-out:
- zfs_range_unlock(rl);
- VN_RELE(ZTOV(zp));
- return (error);
-}
-
-/*
- * Check whether the caller's credentials permit the requested access
- * to vp.  If V_ACE_MASK is set in flag the check goes through
- * zfs_zaccess(); otherwise it goes through zfs_zaccess_rwx().
- *
- *	RETURN:	0 if access is allowed
- *		error code otherwise
- */
-/*ARGSUSED*/
-static int
-zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	int		rc;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	rc = (flag & V_ACE_MASK) ?
-	    zfs_zaccess(zp, mode, flag, B_FALSE, cr) :
-	    zfs_zaccess_rwx(zp, mode, flag, cr);
-
-	ZFS_EXIT(zfsvfs);
-	return (rc);
-}
-
-/*
- * Lookup an entry in a directory, or an extended attribute directory.
- * If it exists, return a held vnode reference for it.
- *
- *	IN:	dvp	- vnode of directory to search.
- *		nm	- name of entry to lookup.
- *		pnp	- full pathname to lookup [UNUSED].
- *		flags	- LOOKUP_XATTR set if looking for an attribute.
- *		rdir	- root directory vnode [UNUSED].
- *		cr	- credentials of caller.
- *		ct	- caller context
- *		direntflags - directory lookup flags
- *		realpnp - returned pathname.
- *
- *	OUT:	vpp	- vnode of located entry, NULL if not found or if
- *			  the device-vnode conversion fails.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	NA
- */
-/* ARGSUSED */
-static int
-zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
-    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-	znode_t *zdp = VTOZ(dvp);
-	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-	int	error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zdp);
-
-	*vpp = NULL;
-
-	if (flags & LOOKUP_XATTR) {
-		/*
-		 * If the xattr property is off, refuse the lookup request.
-		 */
-		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
-			ZFS_EXIT(zfsvfs);
-			return (EINVAL);
-		}
-
-		/*
-		 * We don't allow recursive attributes..
-		 * Maybe someday we will.
-		 */
-		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
-			ZFS_EXIT(zfsvfs);
-			return (EINVAL);
-		}
-
-		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-
-		/*
-		 * Do we have permission to get into attribute directory?
-		 */
-
-		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
-		    B_FALSE, cr)) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-		}
-
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	if (dvp->v_type != VDIR) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOTDIR);
-	}
-
-	/*
-	 * Check accessibility of directory.
-	 */
-
-	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	/* Reject names that are not valid UTF-8 on utf8-only datasets. */
-	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
-	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
-	}
-
-	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
-	if (error == 0) {
-		/*
-		 * Convert device special files
-		 */
-		if (IS_DEVVP(*vpp)) {
-			vnode_t	*svp;
-
-			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-			VN_RELE(*vpp);
-			if (svp == NULL)
-				error = ENOSYS;
-			/*
-			 * Always overwrite *vpp: the original vnode was
-			 * just released, so on failure the caller must
-			 * see NULL rather than a dangling pointer.
-			 */
-			*vpp = svp;
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * Attempt to create a new entry in a directory.  If the entry
- * already exists, truncate the file if permissible, else return
- * an error.  Return the vp of the created or trunc'd file.
- *
- *	IN:	dvp	- vnode of directory to put new file entry in.
- *		name	- name of new file entry.
- *		vap	- attributes of new file.
- *		excl	- flag indicating exclusive or non-exclusive mode.
- *		mode	- mode to open file with.
- *		cr	- credentials of caller.
- *		flag	- large file flag [UNUSED].
- *		ct	- caller context
- *		vsecp 	- ACL to be set
- *
- *	OUT:	vpp	- vnode of created or trunc'd entry.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	dvp - ctime|mtime updated if new entry created
- *	 vp - ctime|mtime always, atime if new
- */
-
-/* ARGSUSED */
-static int
-zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
-    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
-    vsecattr_t *vsecp)
-{
-	znode_t		*zp, *dzp = VTOZ(dvp);
-	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog;
-	objset_t	*os;
-	zfs_dirlock_t	*dl;
-	dmu_tx_t	*tx;
-	int		error;
-	zfs_acl_t	*aclp = NULL;
-	zfs_fuid_info_t *fuidp = NULL;
-
-	/*
-	 * If we have an ephemeral id, ACL, or XVATTR then
-	 * make sure file system is at proper version
-	 */
-
-	if (zfsvfs->z_use_fuids == B_FALSE &&
-	    (vsecp || (vap->va_mask & AT_XVATTR) ||
-	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
-		return (EINVAL);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(dzp);
-	os = zfsvfs->z_os;
-	zilog = zfsvfs->z_log;
-
-	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
-	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
-	}
-
-	if (vap->va_mask & AT_XVATTR) {
-		if ((error = secpolicy_xvattr((xvattr_t *)vap,
-		    crgetuid(cr), cr, vap->va_type)) != 0) {
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-	}
-	/*
-	 * Retry point: we come back here after waiting out an ERESTART
-	 * from a TXG_NOWAIT transaction.  aclp survives across retries.
-	 */
-top:
-	*vpp = NULL;
-
-	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
-		vap->va_mode &= ~VSVTX;
-
-	if (*name == '\0') {
-		/*
-		 * Null component name refers to the directory itself.
-		 */
-		VN_HOLD(dvp);
-		zp = dzp;
-		dl = NULL;
-		error = 0;
-	} else {
-		/* possible VN_HOLD(zp) */
-		int zflg = 0;
-
-		if (flag & FIGNORECASE)
-			zflg |= ZCILOOK;
-
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-		    NULL, NULL);
-		if (error) {
-			if (strcmp(name, "..") == 0)
-				error = EISDIR;
-			ZFS_EXIT(zfsvfs);
-			if (aclp)
-				zfs_acl_free(aclp);
-			return (error);
-		}
-	}
-	if (vsecp && aclp == NULL) {
-		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-		if (error) {
-			/*
-			 * Undo what was acquired above: the dirent lock
-			 * and the vnode hold (VN_HOLD(dvp), or the hold
-			 * zfs_dirent_lock() may have taken on an existing
-			 * entry).  Without the VN_RELE the hold leaks.
-			 */
-			if (dl)
-				zfs_dirent_unlock(dl);
-			if (zp)
-				VN_RELE(ZTOV(zp));
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-	}
-
-	if (zp == NULL) {
-		uint64_t txtype;
-
-		/*
-		 * Create a new file object and update the directory
-		 * to reference it.
-		 */
-		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
-			goto out;
-		}
-
-		/*
-		 * We only support the creation of regular files in
-		 * extended attribute directories.
-		 */
-		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
-		    (vap->va_type != VREG)) {
-			error = EINVAL;
-			goto out;
-		}
-
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
-		    IS_EPHEMERAL(crgetgid(cr))) {
-			if (zfsvfs->z_fuid_obj == 0) {
-				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
-				    FALSE, NULL);
-			} else {
-				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-			}
-		}
-		dmu_tx_hold_bonus(tx, dzp->z_id);
-		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-			    0, SPA_MAXBLOCKSIZE);
-		}
-		error = dmu_tx_assign(tx, zfsvfs->z_assign);
-		if (error) {
-			zfs_dirent_unlock(dl);
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				goto top;
-			}
-			dmu_tx_abort(tx);
-			ZFS_EXIT(zfsvfs);
-			if (aclp)
-				zfs_acl_free(aclp);
-			return (error);
-		}
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
-		(void) zfs_link_create(dl, zp, tx, ZNEW);
-		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
-		if (flag & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
-		    vsecp, fuidp, vap);
-		if (fuidp)
-			zfs_fuid_info_free(fuidp);
-		dmu_tx_commit(tx);
-	} else {
-		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
-
-		/*
-		 * A directory entry already exists for this name.
-		 */
-		/*
-		 * Can't truncate an existing file if in exclusive mode.
-		 */
-		if (excl == EXCL) {
-			error = EEXIST;
-			goto out;
-		}
-		/*
-		 * Can't open a directory for writing.
-		 */
-		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
-			error = EISDIR;
-			goto out;
-		}
-		/*
-		 * Verify requested access to file.
-		 */
-		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
-			goto out;
-		}
-
-		mutex_enter(&dzp->z_lock);
-		dzp->z_seq++;
-		mutex_exit(&dzp->z_lock);
-
-		/*
-		 * Truncate regular files if requested.
-		 */
-		if ((ZTOV(zp)->v_type == VREG) &&
-		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
-			error = zfs_freesp(zp, 0, 0, mode, TRUE);
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
-				/* NB: we already did dmu_tx_wait() */
-				zfs_dirent_unlock(dl);
-				VN_RELE(ZTOV(zp));
-				goto top;
-			}
-
-			if (error == 0) {
-				vnevent_create(ZTOV(zp), ct);
-			}
-		}
-	}
-out:
-
-	if (dl)
-		zfs_dirent_unlock(dl);
-
-	if (error) {
-		if (zp)
-			VN_RELE(ZTOV(zp));
-	} else {
-		*vpp = ZTOV(zp);
-		/*
-		 * If vnode is for a device return a specfs vnode instead.
-		 */
-		if (IS_DEVVP(*vpp)) {
-			struct vnode *svp;
-
-			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-			VN_RELE(*vpp);
-			if (svp == NULL) {
-				error = ENOSYS;
-			}
-			*vpp = svp;
-		}
-	}
-	if (aclp)
-		zfs_acl_free(aclp);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * Remove an entry from a directory.
- *
- * IN: dvp - vnode of directory to remove entry from.
- * name - name of entry to remove.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime
- * vp - ctime (if nlink > 0)
- */
-/*ARGSUSED*/
-static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
- int flags)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
- vnode_t *vp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- uint64_t acl_obj, xattr_obj;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
- boolean_t unlinked;
- uint64_t txtype;
- pathname_t *realnmp = NULL;
- pathname_t realnm;
- int error;
- int zflg = ZEXISTS;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- /*
- * For a case-insensitive remove, have zfs_dirent_lock() report the
- * name it matched in realnm; that name is used for dnlc_remove().
- */
- if (flags & FIGNORECASE) {
- zflg |= ZCILOOK;
- pn_alloc(&realnm);
- realnmp = &realnm;
- }
-
- /* Retry point after an ERESTART from a TXG_NOWAIT assignment. */
-top:
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, realnmp)) {
- if (realnmp)
- pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- /*
- * Need to use rmdir for removing directories.
- */
- if (vp->v_type == VDIR) {
- error = EPERM;
- goto out;
- }
-
- vnevent_remove(vp, dvp, name, ct);
-
- if (realnmp)
- dnlc_remove(dvp, realnmp->pn_buf);
- else
- dnlc_remove(dvp, name);
-
- /*
- * First guess at whether the znode can be destroyed in this
- * transaction; it is re-checked after the link is destroyed.
- */
- mutex_enter(&vp->v_lock);
- may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
- mutex_exit(&vp->v_lock);
-
- /*
- * We may delete the znode now, or we may put it in the unlinked set;
- * it depends on whether we're the last link, and on whether there are
- * other holds on the vnode. So we dmu_tx_hold() the right things to
- * allow for either case.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
- if (may_delete_now)
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
-
- /* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
- }
-
- /* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
- /* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- /*
- * Drop the dirent lock and vnode hold before waiting or
- * returning; the retry re-acquires them at "top".
- */
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- if (realnmp)
- pn_free(realnmp);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Remove the directory entry.
- */
- error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
-
- if (error) {
- dmu_tx_commit(tx);
- goto out;
- }
-
- /*
- * Only delete immediately if the earlier guess still holds: we are
- * the sole holder, no cached pages exist, and the xattr and external
- * ACL objects are the ones the transaction holds were taken for.
- */
- if (unlinked) {
- mutex_enter(&vp->v_lock);
- delete_now = may_delete_now &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
- mutex_exit(&vp->v_lock);
- }
-
- if (delete_now) {
- if (zp->z_phys->zp_xattr) {
- /* Mark the xattr directory unlinked in this tx too. */
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
- }
- /*
- * Drop our hold by hand (instead of VN_RELE) so the count is
- * zero when the znode is handed to zfs_znode_delete().
- */
- mutex_enter(&zp->z_lock);
- mutex_enter(&vp->v_lock);
- vp->v_count--;
- ASSERT3U(vp->v_count, ==, 0);
- mutex_exit(&vp->v_lock);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- } else if (unlinked) {
- zfs_unlinked_add(zp, tx);
- }
-
- txtype = TX_REMOVE;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
-
- dmu_tx_commit(tx);
-out:
- if (realnmp)
- pn_free(realnmp);
-
- zfs_dirent_unlock(dl);
-
- /*
- * When delete_now is set the hold on vp was already consumed above,
- * so only the xattr znode (if any) still needs releasing.
- */
- if (!delete_now) {
- VN_RELE(vp);
- } else if (xzp) {
- /* this rele delayed to prevent nesting transactions */
- VN_RELE(ZTOV(xzp));
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Create a new directory and insert it into dvp using the name
- * provided.  Return a pointer to the inserted directory.
- *
- *	IN:	dvp	- vnode of directory to add subdir to.
- *		dirname	- name of new directory.
- *		vap	- attributes of new directory.
- *		cr	- credentials of caller.
- *		ct	- caller context
- *		flags	- case flags
- *		vsecp	- ACL to be set
- *
- *	OUT:	vpp	- vnode of created directory.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	dvp - ctime|mtime updated
- *	 vp - ctime|mtime|atime updated
- */
-/*ARGSUSED*/
-static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
-    caller_context_t *ct, int flags, vsecattr_t *vsecp)
-{
-	znode_t		*zp, *dzp = VTOZ(dvp);
-	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
-	uint64_t	txtype;
-	dmu_tx_t	*tx;
-	int		error;
-	zfs_acl_t	*aclp = NULL;
-	zfs_fuid_info_t	*fuidp = NULL;
-	int		zf = ZNEW;
-
-	ASSERT(vap->va_type == VDIR);
-
-	/*
-	 * If we have an ephemeral id, ACL, or XVATTR then
-	 * make sure file system is at proper version
-	 */
-
-	if (zfsvfs->z_use_fuids == B_FALSE &&
-	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
-	    IS_EPHEMERAL(crgetgid(cr))))
-		return (EINVAL);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(dzp);
-	zilog = zfsvfs->z_log;
-
-	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	if (zfsvfs->z_utf8 && u8_validate(dirname,
-	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
-	}
-	if (flags & FIGNORECASE)
-		zf |= ZCILOOK;
-
-	if (vap->va_mask & AT_XVATTR) {
-		if ((error = secpolicy_xvattr((xvattr_t *)vap,
-		    crgetuid(cr), cr, vap->va_type)) != 0) {
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-	}
-
-	/*
-	 * First make sure the new directory doesn't exist.
-	 *
-	 * "top" is also the retry point after an ERESTART from a
-	 * TXG_NOWAIT assignment; aclp may already be allocated when we
-	 * come back here, so every failure return below must free it.
-	 */
-top:
-	*vpp = NULL;
-
-	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
-	    NULL, NULL)) {
-		ZFS_EXIT(zfsvfs);
-		if (aclp)
-			zfs_acl_free(aclp);
-		return (error);
-	}
-
-	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
-		zfs_dirent_unlock(dl);
-		ZFS_EXIT(zfsvfs);
-		if (aclp)
-			zfs_acl_free(aclp);
-		return (error);
-	}
-
-	if (vsecp && aclp == NULL) {
-		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-		if (error) {
-			zfs_dirent_unlock(dl);
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-	}
-	/*
-	 * Add a new entry to the directory.
-	 */
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
-	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
-	    IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
-	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-		    0, SPA_MAXBLOCKSIZE);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (error) {
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		ZFS_EXIT(zfsvfs);
-		if (aclp)
-			zfs_acl_free(aclp);
-		return (error);
-	}
-
-	/*
-	 * Create new node.
-	 */
-	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
-
-	if (aclp)
-		zfs_acl_free(aclp);
-
-	/*
-	 * Now put new name in parent dir.
-	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
-
-	*vpp = ZTOV(zp);
-
-	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
-	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
-
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
-	dmu_tx_commit(tx);
-
-	zfs_dirent_unlock(dl);
-
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
-
-/*
- * Remove the subdirectory called "name" from directory dvp.  The
- * request is refused if the target is the caller's current working
- * directory.
- *
- *	IN:	dvp	- vnode of directory to remove from.
- *		name	- name of directory to be removed.
- *		cwd	- vnode of current working directory.
- *		cr	- credentials of caller.
- *		ct	- caller context
- *		flags	- case flags
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	dvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
-{
-	znode_t		*dzp = VTOZ(dvp);
-	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	znode_t		*zp;
-	vnode_t		*vp;
-	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
-	dmu_tx_t	*tx;
-	int		zflg;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(dzp);
-	zilog = zfsvfs->z_log;
-
-	zflg = (flags & FIGNORECASE) ? (ZEXISTS | ZCILOOK) : ZEXISTS;
-
-top:
-	zp = NULL;
-
-	/*
-	 * Lock the directory entry; this fails if it does not exist.
-	 */
-	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
-	if (error != 0) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vp = ZTOV(zp);
-
-	error = zfs_zaccess_delete(dzp, zp, cr);
-	if (error != 0)
-		goto out;
-
-	if (vp->v_type != VDIR) {
-		error = ENOTDIR;
-		goto out;
-	}
-
-	if (vp == cwd) {
-		error = EINVAL;
-		goto out;
-	}
-
-	vnevent_rmdir(vp, dvp, name, ct);
-
-	/*
-	 * Take the name lock so nobody adds to (or looks up in) the
-	 * directory while it is being removed, then the parent lock so
-	 * we cooperate with the treewalk and directory rename code.
-	 */
-	rw_enter(&zp->z_name_lock, RW_WRITER);
-	rw_enter(&zp->z_parent_lock, RW_WRITER);
-
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (error != 0) {
-		/* Release everything taken since "top" before retrying. */
-		rw_exit(&zp->z_parent_lock);
-		rw_exit(&zp->z_name_lock);
-		zfs_dirent_unlock(dl);
-		VN_RELE(vp);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
-	if (error == 0) {
-		uint64_t txtype;
-
-		txtype = (flags & FIGNORECASE) ?
-		    (TX_RMDIR | TX_CI) : TX_RMDIR;
-		zfs_log_remove(zilog, tx, txtype, dzp, name);
-	}
-
-	dmu_tx_commit(tx);
-
-	rw_exit(&zp->z_parent_lock);
-	rw_exit(&zp->z_name_lock);
-
-out:
-	zfs_dirent_unlock(dl);
-	VN_RELE(vp);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * Read as many directory entries as will fit into the provided
- * buffer from the given directory cursor position (specified in
- * the uio structure).
- *
- * IN: vp - vnode of directory to read.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * OUT: uio - updated offset and range, buffer filled.
- * eofp - set to true if end-of-file detected.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - atime updated
- *
- * Note that the low 4 bits of the cookie returned by zap is always zero.
- * This allows us to use the low range for "special" directory entries:
- * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
- * we use the offset 2 for the '.zfs' directory.
- */
-/* ARGSUSED */
-static int
-zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
- caller_context_t *ct, int flags)
-{
- znode_t *zp = VTOZ(vp);
- iovec_t *iovp;
- edirent_t *eodp;
- dirent64_t *odp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
- caddr_t outbuf;
- size_t bufsize;
- zap_cursor_t zc;
- zap_attribute_t zap;
- uint_t bytes_wanted;
- uint64_t offset; /* must be unsigned; checks for < 1 */
- int local_eof;
- int outcount;
- int error;
- uint8_t prefetch;
- boolean_t check_sysattrs;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * If we are not given an eof variable,
- * use a local one.
- */
- if (eofp == NULL)
- eofp = &local_eof;
-
- /*
- * Check for valid iov_len.
- */
- if (uio->uio_iov->iov_len <= 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Quit if directory has been removed (posix)
- */
- if ((*eofp = zp->z_unlinked) != 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- error = 0;
- os = zfsvfs->z_os;
- offset = uio->uio_loffset;
- prefetch = zp->z_zn_prefetch;
-
- /*
- * Initialize the iterator cursor.
- */
- if (offset <= 3) {
- /*
- * Start iteration from the beginning of the directory.
- */
- zap_cursor_init(&zc, os, zp->z_id);
- } else {
- /*
- * The offset is a serialized cursor.
- */
- zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
- }
-
- /*
- * Get space to change directory entries into fs independent format.
- * A single kernel-space iovec is filled in place; anything else is
- * staged in a temporary buffer (outbuf) and copied out with uiomove()
- * below. outbuf is only assigned, used and freed in that second case.
- */
- iovp = uio->uio_iov;
- bytes_wanted = iovp->iov_len;
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
- bufsize = bytes_wanted;
- outbuf = kmem_alloc(bufsize, KM_SLEEP);
- odp = (struct dirent64 *)outbuf;
- } else {
- bufsize = bytes_wanted;
- odp = (struct dirent64 *)iovp->iov_base;
- }
- /* eodp and odp alias the same output position; flags picks one. */
- eodp = (struct edirent *)odp;
-
- /*
- * If this VFS supports system attributes; and we're looking at an
- * extended attribute directory; and we care about normalization
- * conflicts on this vfs; then we must check for normalization
- * conflicts with the sysattr name space.
- */
- check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) &&
- (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
- (flags & V_RDDIR_ENTFLAGS);
-
- /*
- * Transform to file-system independent format
- */
- outcount = 0;
- while (outcount < bytes_wanted) {
- ino64_t objnum;
- ushort_t reclen;
- off64_t *next;
-
- /*
- * Special case `.', `..', and `.zfs'.
- */
- if (offset == 0) {
- (void) strcpy(zap.za_name, ".");
- zap.za_normalization_conflict = 0;
- objnum = zp->z_id;
- } else if (offset == 1) {
- (void) strcpy(zap.za_name, "..");
- zap.za_normalization_conflict = 0;
- objnum = zp->z_phys->zp_parent;
- } else if (offset == 2 && zfs_show_ctldir(zp)) {
- (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
- zap.za_normalization_conflict = 0;
- objnum = ZFSCTL_INO_ROOT;
- } else {
- /*
- * Grab next entry.
- */
- if (error = zap_cursor_retrieve(&zc, &zap)) {
- /* ENOENT means end of directory, not failure */
- if ((*eofp = (error == ENOENT)) != 0)
- break;
- else
- goto update;
- }
-
- if (zap.za_integer_length != 8 ||
- zap.za_num_integers != 1) {
- cmn_err(CE_WARN, "zap_readdir: bad directory "
- "entry, obj = %lld, offset = %lld\n",
- (u_longlong_t)zp->z_id,
- (u_longlong_t)offset);
- error = ENXIO;
- goto update;
- }
-
- objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
- /*
- * MacOS X can extract the object type here such as:
- * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- */
-
- if (check_sysattrs && !zap.za_normalization_conflict) {
- zap.za_normalization_conflict =
- xattr_sysattr_casechk(zap.za_name);
- }
- }
-
- if (flags & V_RDDIR_ENTFLAGS)
- reclen = EDIRENT_RECLEN(strlen(zap.za_name));
- else
- reclen = DIRENT64_RECLEN(strlen(zap.za_name));
-
- /*
- * Will this entry fit in the buffer?
- */
- if (outcount + reclen > bufsize) {
- /*
- * Did we manage to fit anything in the buffer?
- */
- if (!outcount) {
- error = EINVAL;
- goto update;
- }
- break;
- }
- if (flags & V_RDDIR_ENTFLAGS) {
- /*
- * Add extended flag entry:
- */
- eodp->ed_ino = objnum;
- eodp->ed_reclen = reclen;
- /* NOTE: ed_off is the offset for the *next* entry */
- next = &(eodp->ed_off);
- eodp->ed_eflags = zap.za_normalization_conflict ?
- ED_CASE_CONFLICT : 0;
- (void) strncpy(eodp->ed_name, zap.za_name,
- EDIRENT_NAMELEN(reclen));
- eodp = (edirent_t *)((intptr_t)eodp + reclen);
- } else {
- /*
- * Add normal entry:
- */
- odp->d_ino = objnum;
- odp->d_reclen = reclen;
- /* NOTE: d_off is the offset for the *next* entry */
- next = &(odp->d_off);
- (void) strncpy(odp->d_name, zap.za_name,
- DIRENT64_NAMELEN(reclen));
- odp = (dirent64_t *)((intptr_t)odp + reclen);
- }
- outcount += reclen;
-
- ASSERT(outcount <= bufsize);
-
- /* Prefetch znode */
- if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
-
- /*
- * Move to the next entry, fill in the previous offset.
- * Only entries that came from the ZAP advance the cursor;
- * the synthetic entries just step the offset by one.
- */
- if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
- zap_cursor_advance(&zc);
- offset = zap_cursor_serialize(&zc);
- } else {
- offset += 1;
- }
- *next = offset;
- }
- zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
-
- if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
- /* entries were written in place: just account for them */
- iovp->iov_base += outcount;
- iovp->iov_len -= outcount;
- uio->uio_resid -= outcount;
- } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
- /*
- * Reset the pointer.
- */
- offset = uio->uio_loffset;
- }
-
- /* Common exit: release the cursor and staging buffer, store offset. */
-update:
- zap_cursor_fini(&zc);
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
- kmem_free(outbuf, bufsize);
-
- if (error == ENOENT)
- error = 0;
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-
- uio->uio_loffset = offset;
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Value that zfs_fsync() stores in thread-specific data under
- * zfs_fsyncer_key.
- * NOTE(review): presumably a count of later writes by the same thread to
- * treat specially -- confirm against the consumer of zfs_fsyncer_key.
- */
-ulong_t zfs_fsync_sync_cnt = 4;
-
-/*
- * Push a file's changes to stable storage: start an asynchronous
- * page flush when the vnode has cached pages, record
- * zfs_fsync_sync_cnt in thread-specific data, and commit the ZIL
- * for this znode.
- *
- *	RETURN:	0 (the putpage and tsd_set results are ignored)
- */
-static int
-zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	int		flush_pages;
-
-	/*
-	 * Whether or not standards require it, flushing is the sensible
-	 * thing to do when fsync() hits a file with dirty pages.  B_ASYNC
-	 * is enough because zil_commit() below pushes the ZIL
-	 * transactions out anyway.
-	 */
-	flush_pages = vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
-	    (vp->v_type == VREG) && !(IS_SWAPVP(vp));
-	if (flush_pages) {
-		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC,
-		    cr, ct);
-	}
-
-	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
-}
-
-
-/*
- * Get the requested file attributes and place them in the provided
- * vattr structure.
- *
- * IN: vp - vnode of file.
- * vap - va_mask identifies requested attributes.
- * If AT_XVATTR set, then optional attrs are requested
- * flags - ATTR_NOACLCHECK (CIFS server context)
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: vap - attribute values.
- *
- * RETURN: 0 (always succeeds)
- */
-/* ARGSUSED */
-static int
-zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp;
- int error = 0;
- uint64_t links;
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap = NULL;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
-
- mutex_enter(&zp->z_lock);
-
- /*
- * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
- * Also, if we are the owner don't bother, since owner should
- * always be allowed to read basic attributes of file.
- */
- if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
- (pzp->zp_uid != crgetuid(cr))) {
- if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
- skipaclchk, cr)) {
- mutex_exit(&zp->z_lock);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * Return all attributes. It's cheaper to provide the answer
- * than to determine whether we were asked the question.
- */
-
- vap->va_type = vp->v_type;
- vap->va_mode = pzp->zp_mode & MODEMASK;
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
- vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
- vap->va_nodeid = zp->z_id;
- if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
- links = pzp->zp_links + 1;
- else
- links = pzp->zp_links;
- vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
- vap->va_rdev = vp->v_rdev;
- vap->va_seq = zp->z_seq;
-
- /*
- * Add in any requested optional attributes and the create time.
- * Also set the corresponding bits in the returned attribute bitmap.
- */
- if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- xoap->xoa_archive =
- ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
- XVA_SET_RTN(xvap, XAT_ARCHIVE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- xoap->xoa_readonly =
- ((pzp->zp_flags & ZFS_READONLY) != 0);
- XVA_SET_RTN(xvap, XAT_READONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- xoap->xoa_system =
- ((pzp->zp_flags & ZFS_SYSTEM) != 0);
- XVA_SET_RTN(xvap, XAT_SYSTEM);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- xoap->xoa_hidden =
- ((pzp->zp_flags & ZFS_HIDDEN) != 0);
- XVA_SET_RTN(xvap, XAT_HIDDEN);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- xoap->xoa_nounlink =
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
- XVA_SET_RTN(xvap, XAT_NOUNLINK);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- xoap->xoa_immutable =
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
- XVA_SET_RTN(xvap, XAT_IMMUTABLE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- xoap->xoa_appendonly =
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
- XVA_SET_RTN(xvap, XAT_APPENDONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- xoap->xoa_nodump =
- ((pzp->zp_flags & ZFS_NODUMP) != 0);
- XVA_SET_RTN(xvap, XAT_NODUMP);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- xoap->xoa_opaque =
- ((pzp->zp_flags & ZFS_OPAQUE) != 0);
- XVA_SET_RTN(xvap, XAT_OPAQUE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- xoap->xoa_av_quarantined =
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- xoap->xoa_av_modified =
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- vp->v_type == VREG &&
- (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
- /*
- * Only VREG files have anti-virus scanstamps, so we
- * won't conflict with symlinks in the bonus buffer.
- */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len <= doi.doi_bonus_size) {
- /*
- * pzp points to the start of the
- * znode_phys_t. pzp + 1 points to the
- * first byte after the znode_phys_t.
- */
- (void) memcpy(xoap->xoa_av_scanstamp,
- pzp + 1,
- sizeof (xoap->xoa_av_scanstamp));
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
- XVA_SET_RTN(xvap, XAT_CREATETIME);
- }
- }
-
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
-
- mutex_exit(&zp->z_lock);
-
- dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
-
- if (zp->z_blksz == 0) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- vap->va_blksize = zfsvfs->z_max_blksz;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Set the file attributes to the values contained in the
- * vattr structure.
- *
- * IN: vp - vnode of file to be modified.
- * vap - new attribute values.
- * If AT_XVATTR set, then optional attrs are being set
- * flags - ATTR_UTIME set if non-default time values provided.
- * - ATTR_NOACLCHECK (CIFS context only).
- * cr - credentials of caller.
- * ct - caller context
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Errors produced directly by this function:
- * EINVAL - AT_NOSET bits present, ephemeral ids or xvattr on a file
- * system without FUID support, or AT_SIZE on a vnode that is
- * neither VREG nor VFIFO.
- * EISDIR - AT_SIZE on a directory.
- * EPERM - change not permitted on an immutable file, or AT_SIZE on
- * a read-only file.
- * EOVERFLOW - requested atime/mtime fails TIMESPEC_OVERFLOW().
- * EROFS - file system is mounted read-only.
- * Errors from the access checks, zfs_freesp(), zfs_zget() and
- * dmu_tx_assign() are passed through unchanged.
- *
- * Timestamps:
- * vp - ctime updated, mtime updated if size changed.
- */
-/* ARGSUSED */
-static int
-zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- znode_phys_t *pzp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog;
- dmu_tx_t *tx;
- vattr_t oldva;
- uint_t mask = vap->va_mask;
- uint_t saved_mask;
- int trim_mask = 0;
- uint64_t new_mode;
- znode_t *attrzp;
- int need_policy = FALSE;
- int err;
- zfs_fuid_info_t *fuidp = NULL;
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap;
- zfs_acl_t *aclp = NULL;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- /* Nothing requested: succeed without entering the file system. */
- if (mask == 0)
- return (0);
-
- if (mask & AT_NOSET)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- pzp = zp->z_phys;
- zilog = zfsvfs->z_log;
-
- /*
- * Make sure that if we have ephemeral uid/gid or xvattr specified
- * that file system is at proper version level
- */
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
- ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
- (mask & AT_XVATTR))) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- if (mask & AT_SIZE && vp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (EISDIR);
- }
-
- if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If this is an xvattr_t, then get a pointer to the structure of
- * optional attributes. If this is NULL, then we have a vattr_t.
- */
- xoap = xva_getxoptattr(xvap);
-
- /*
- * Immutable files can only alter immutable bit and atime
- */
- if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
- ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
- ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- /*
- * Verify that the timestamps do not overflow 32 bits.
- * ZFS can handle large timestamps, but 32bit syscalls can't
- * handle times greater than 2039. This check should be removed
- * once large timestamps are fully supported.
- */
- if (mask & (AT_ATIME | AT_MTIME)) {
- if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
- ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
- ZFS_EXIT(zfsvfs);
- return (EOVERFLOW);
- }
- }
-
- /*
- * Retry point: control comes back here after dmu_tx_wait() when
- * dmu_tx_assign() below fails with ERESTART under TXG_NOWAIT.
- * attrzp is reset because that path has already released it.
- */
-top:
- attrzp = NULL;
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (EROFS);
- }
-
- /*
- * First validate permissions
- */
-
- if (mask & AT_SIZE) {
- err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- /*
- * XXX - Note, we are not providing any open
- * mode flags here (like FNDELAY), so we may
- * block if there are locks present... this
- * should be addressed in openat().
- */
- do {
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- /* NB: we already did dmu_tx_wait() if necessary */
- } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- /*
- * need_policy is used as a boolean from here on: a non-zero
- * zfs_zaccess() result sends the request through
- * secpolicy_vnode_setattr() further down.
- */
- if (mask & (AT_ATIME|AT_MTIME) ||
- ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
- XVA_ISSET_REQ(xvap, XAT_READONLY) ||
- XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
- XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
- need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
- skipaclchk, cr);
-
- if (mask & (AT_UID|AT_GID)) {
- int idmask = (mask & (AT_UID|AT_GID));
- int take_owner;
- int take_group;
-
- /*
- * NOTE: even if a new mode is being set,
- * we may clear S_ISUID/S_ISGID bits.
- */
-
- if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
-
- /*
- * Take ownership or chgrp to group we are a member of
- */
-
- take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
- take_group = (mask & AT_GID) &&
- zfs_groupmember(zfsvfs, vap->va_gid, cr);
-
- /*
- * If both AT_UID and AT_GID are set then take_owner and
- * take_group must both be set in order to allow taking
- * ownership.
- *
- * Otherwise, send the check through secpolicy_vnode_setattr()
- *
- */
-
- if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
- ((idmask == AT_UID) && take_owner) ||
- ((idmask == AT_GID) && take_group)) {
- if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
- skipaclchk, cr) == 0) {
- /*
- * Remove setuid/setgid for non-privileged users
- */
- secpolicy_setid_clear(vap, cr);
- trim_mask = (mask & (AT_UID|AT_GID));
- } else {
- need_policy = TRUE;
- }
- } else {
- need_policy = TRUE;
- }
- }
-
- /*
- * Snapshot the current mode and mapped owner/group into oldva;
- * oldva is handed to the secpolicy routines below.
- */
- mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode;
- zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
- if (mask & AT_XVATTR) {
- /*
- * NOTE(review): && binds tighter than ||, so the
- * (need_policy == FALSE) test guards only the
- * XAT_APPENDONLY comparison; every later alternative is
- * evaluated regardless. The outcome is unaffected because
- * the body only ever sets need_policy to TRUE.
- */
- if ((need_policy == FALSE) &&
- (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
- xoap->xoa_appendonly !=
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
- (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
- xoap->xoa_nounlink !=
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
- (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
- xoap->xoa_immutable !=
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
- (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
- xoap->xoa_nodump !=
- ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
- (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
- xoap->xoa_av_modified !=
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
- ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
- ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
- xoap->xoa_av_quarantined !=
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
- (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
- (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
- need_policy = TRUE;
- }
- }
-
- mutex_exit(&zp->z_lock);
-
- if (mask & AT_MODE) {
- if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
- err = secpolicy_setid_setsticky_clear(vp, vap,
- &oldva, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- trim_mask |= AT_MODE;
- } else {
- need_policy = TRUE;
- }
- }
-
- if (need_policy) {
- /*
- * If trim_mask is set then take ownership
- * has been granted or write_acl is present and user
- * has the ability to modify mode. In that case remove
- * UID|GID and or MODE from mask so that
- * secpolicy_vnode_setattr() doesn't revoke it.
- */
-
- if (trim_mask) {
- saved_mask = vap->va_mask;
- vap->va_mask &= ~trim_mask;
- }
- err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
- if (err) {
- /*
- * NOTE(review): on this return vap->va_mask is left
- * with the trim_mask bits cleared; it is only
- * restored on the success path below.
- */
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- /* Put back the bits that were hidden from the policy check. */
- if (trim_mask)
- vap->va_mask |= saved_mask;
- }
-
- /*
- * secpolicy_vnode_setattr, or take ownership may have
- * changed va_mask
- */
- mask = vap->va_mask;
-
- /*
- * Build the transaction: the znode's bonus buffer is always held;
- * FUID table, ACL object and xattr-directory holds are added only
- * when the request needs them.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
- ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- /* Keep the existing file-type bits; take the rest from vap. */
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- if (pzp->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL from old V0 format to new V1 */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- pzp->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- }
-
- /*
- * An owner or group change is mirrored onto the znode named by
- * zp_xattr (see the zp_uid/zp_gid updates below), so that znode is
- * looked up and its bonus buffer held as well.
- */
- if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
- err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
- if (err) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- if (aclp)
- zfs_acl_free(aclp);
- return (err);
- }
- dmu_tx_hold_bonus(tx, attrzp->z_id);
- }
-
- err = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (err) {
- /* Release everything acquired for this attempt. */
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
-
- if (aclp) {
- zfs_acl_free(aclp);
- aclp = NULL;
- }
-
- if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- /*
- * Set each attribute requested.
- * We group settings according to the locks they need to acquire.
- *
- * Note: you cannot set ctime directly, although it will be
- * updated as a side-effect of calling this function.
- */
-
- mutex_enter(&zp->z_lock);
-
- if (mask & AT_MODE) {
- /* z_acl_lock is taken inside z_lock for the mode/ACL update. */
- mutex_enter(&zp->z_acl_lock);
- zp->z_phys->zp_mode = new_mode;
- err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
- ASSERT3U(err, ==, 0);
- mutex_exit(&zp->z_acl_lock);
- }
-
- if (attrzp)
- mutex_enter(&attrzp->z_lock);
-
- if (mask & AT_UID) {
- pzp->zp_uid = zfs_fuid_create(zfsvfs,
- vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
- if (attrzp) {
- attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
- vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
- }
- }
-
- if (mask & AT_GID) {
- pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
- cr, ZFS_GROUP, tx, &fuidp);
- if (attrzp)
- attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
- vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
- }
-
- if (aclp)
- zfs_acl_free(aclp);
-
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- /*
- * Do this after setting timestamps to prevent timestamp
- * update from toggling bit
- */
-
- if (xoap && (mask & AT_XVATTR)) {
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
- ASSERT(vp->v_type == VREG);
-
- /* Grow the bonus buffer if necessary. */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len > doi.doi_bonus_size)
- VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
- }
- zfs_xvattr_set(zp, xvap);
- }
-
- if (mask != 0)
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
-
- if (fuidp)
- zfs_fuid_info_free(fuidp);
- mutex_exit(&zp->z_lock);
-
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
-
- dmu_tx_commit(tx);
-
- /*
- * err is 0 from the successful dmu_tx_assign() unless AT_MODE was
- * set, in which case it holds the zfs_aclset_common() result.
- */
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * One entry in the singly linked list of parent locks that
- * zfs_rename_lock() builds and zfs_rename_unlock() tears down.
- */
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held, or NULL if none */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
-/*
- * Undo zfs_rename_lock(): walk the list at *zlpp from the head, and for
- * each entry release the held vnode (if any), drop the rwlock and free the
- * entry.  *zlpp is NULL on return.
- */
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
-{
-    zfs_zlock_t *cur;
-
-    for (cur = *zlpp; cur != NULL; cur = *zlpp) {
-        if (cur->zl_znode != NULL) {
-            VN_RELE(ZTOV(cur->zl_znode));
-        }
-        rw_exit(cur->zl_rwlock);
-
-        /* Unlink before freeing so *zlpp never points at freed memory. */
-        *zlpp = cur->zl_next;
-        kmem_free(cur, sizeof (*cur));
-    }
-}
-
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- *
- * IN: szp - znode being renamed; its z_parent_lock is write-locked
- * on the first pass.
- * tdzp - target directory; the walk starts here.
- * sdzp - source directory; the walk stops when it is reached.
- *
- * OUT: zlpp - list of locks taken, most recent first. Entries are left
- * on the list on every return, including error returns, so
- * the caller must pass it to zfs_rename_unlock().
- *
- * RETURN: 0 if the root or sdzp was reached
- * EINVAL if szp was found on the path from tdzp upwards
- * error code from zfs_zget() if a parent could not be loaded
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
-
- /*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
- */
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- /*
- * NOTE(review): the object-id comparison presumably
- * picks which of two contending renames backs off --
- * confirm against the lock-ordering design.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- /*
- * zl is the entry pushed most recently, i.e.
- * the current head of *zlpp, so this releases
- * the whole list. The `continue' re-tests the
- * loop condition with zp == tdzp; a READER
- * pass is only reached when tdzp differs from
- * sdzp, so the loop body runs again.
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = &zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
- }
- }
-
- /* Record the lock just taken at the head of the list. */
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
-
- if (*oidp == szp->z_id) /* We're a descendant of szp */
- return (EINVAL);
-
- if (*oidp == rootid) /* We've hit the top */
- return (0);
-
- if (rw == RW_READER) { /* i.e. not the first pass */
- /*
- * Step up to the parent; the hold obtained here is
- * stored in the list entry and dropped by
- * zfs_rename_unlock().
- */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
- }
- oidp = &zp->z_phys->zp_parent;
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
-
- } while (zp->z_id != sdzp->z_id);
-
- return (0);
-}
-
-/*
- * Move an entry from the provided source directory to the target
- * directory. Change the entry name as indicated.
- *
- * IN: sdvp - Source directory containing the "old entry".
- * snm - Old entry name.
- * tdvp - Target directory to contain the "new entry".
- * tnm - New entry name.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Errors produced directly by this function:
- * EXDEV - source and target directories are on different vfs.
- * EILSEQ - tnm fails UTF-8 validation on a utf8 file system.
- * EINVAL - exactly one of the directories has ZFS_XATTR set, or a
- * failed lookup was for the name "..".
- * ENOTDIR - source is a directory but the existing target is not.
- * EISDIR - existing target is a directory but the source is not.
- *
- * Timestamps:
- * sdvp,tdvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
- caller_context_t *ct, int flags)
-{
- znode_t *tdzp, *szp, *tzp;
- znode_t *sdzp = VTOZ(sdvp);
- zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
- zilog_t *zilog;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
- dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr;
- int error = 0;
- int zflg = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(sdzp);
- zilog = zfsvfs->z_log;
-
- /*
- * Make sure we have the real vp for the target directory.
- */
- if (VOP_REALVP(tdvp, &realvp, ct) == 0)
- tdvp = realvp;
-
- if (tdvp->v_vfsp != sdvp->v_vfsp) {
- ZFS_EXIT(zfsvfs);
- return (EXDEV);
- }
-
- tdzp = VTOZ(tdvp);
- ZFS_VERIFY_ZP(tdzp);
- if (zfsvfs->z_utf8 && u8_validate(tnm,
- strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (EILSEQ);
- }
-
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-
- /*
- * Retry point after a transaction assignment that had to wait.
- * Per-attempt state is reset here; every lock and hold from the
- * previous attempt has already been released.
- */
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- /*
- * First compare the two name arguments without
- * considering any case folding.
- */
- int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
- cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
- ASSERT(error == 0 || !zfsvfs->z_utf8);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- /*
- * If the file system is case-folding, then we may
- * have some more checking to do. A case-folding file
- * system is either supporting mixed case sensitivity
- * access or is completely case-insensitive. Note
- * that the file system is always case preserving.
- *
- * In mixed sensitivity mode case sensitive behavior
- * is the default. FIGNORECASE must be used to
- * explicitly request case insensitive behavior.
- *
- * If the source and target names provided differ only
- * by case (e.g., a request to rename 'tim' to 'Tim'),
- * we will treat this as a special case in the
- * case-insensitive mode: as long as the source name
- * is an exact match, we will allow this to proceed as
- * a name-change request.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- flags & FIGNORECASE)) &&
- u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
- &error) == 0) {
- /*
- * case preserving rename request, require exact
- * name matches
- */
- zflg |= ZCIEXACT;
- zflg &= ~ZCILOOK;
- }
- }
-
- /*
- * Take the two directory-entry locks in the order chosen above.
- * ZRENAMING is passed only on whichever lock is taken second.
- */
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
- ZEXISTS | zflg, NULL, NULL);
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
- } else {
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, zflg, NULL, NULL);
- serr = zfs_dirent_lock(&sdl,
- sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
- NULL, NULL);
- }
-
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
- if (strcmp(snm, "..") == 0)
- serr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- /* Source lock succeeded but target failed: undo the source. */
- zfs_dirent_unlock(sdl);
- VN_RELE(ZTOV(szp));
- if (strcmp(tnm, "..") == 0)
- terr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (terr);
- }
-
- /*
- * Must have write access at the source to remove the old entry
- * and write access at the target to create the new entry.
- * Note that if target and source are the same, this can be
- * done in a single check.
- */
-
- if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
-
- if (ZTOV(szp)->v_type == VDIR) {
- /*
- * Check to make sure rename is valid.
- * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
- */
- /*
- * Locks already taken by zfs_rename_lock() stay on zl even
- * when it fails; the `out' path releases them.
- */
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
- }
-
- /*
- * Does target exist?
- */
- if (tzp) {
- /*
- * Source and target must be the same type.
- */
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- }
- } else {
- if (ZTOV(tzp)->v_type == VDIR) {
- error = EISDIR;
- goto out;
- }
- }
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
- }
-
- vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
- if (tzp)
- vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
-
- /*
- * notify the target directory if it is not the same
- * as source directory.
- */
- if (tdvp != sdvp) {
- vnevent_rename_dest_dir(tdvp, ct);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
- dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- /*
- * Release every lock and hold before either retrying from
- * `top' or failing, so a retry starts from a clean state.
- */
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
-
- if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
- if (error == 0) {
- szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
-
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
-
- zfs_log_rename(zilog, tx,
- TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
- sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
- }
- }
-
- /* The transaction is committed whether or not the steps above failed. */
- dmu_tx_commit(tx);
- /*
- * Common exit: reached only with both directory-entry locks held and
- * szp valid.
- */
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
-
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert the indicated symbolic reference entry into the directory.
- *
- * IN: dvp - Directory to contain new symbolic link.
- * name - Name for new symlink entry.
- * vap - Attributes of new entry.
- * link - Target path of new symlink.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Errors produced directly by this function:
- * EILSEQ - name fails UTF-8 validation on a utf8 file system.
- * ENAMETOOLONG - strlen(link) exceeds MAXPATHLEN.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
- caller_context_t *ct, int flags)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- int len = strlen(link);
- int error;
- int zflg = ZNEW;
- zfs_fuid_info_t *fuidp = NULL;
-
- ASSERT(vap->va_type == VLNK);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (EILSEQ);
- }
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
- /*
- * Retry point after a transaction assignment that had to wait; the
- * access and length checks are repeated on every attempt.
- */
-top:
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
- return (ENAMETOOLONG);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Declare what the transaction may touch: data for the new object,
- * the parent's bonus buffer and ZAP entry, and the FUID table when
- * the caller has an ephemeral uid or gid.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
- if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
- /*
- * Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
- */
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
- /* The link text is stored directly after the znode_phys_t. */
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
-
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
- zp->z_id, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
-
- /*
- * Insert the new object into the directory.
- */
- /* NOTE(review): the zfs_link_create() result is discarded here. */
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- /*
- * NOTE(review): no goto in this function targets `out'. It is reached
- * only by falling through after dmu_tx_assign() succeeded, so error
- * is 0 here and the symlink is always logged.
- */
-out:
- if (error == 0) {
- uint64_t txtype = TX_SYMLINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- }
- if (fuidp)
- zfs_fuid_info_free(fuidp);
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- VN_RELE(ZTOV(zp));
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Copy the target path of a symbolic link into the caller's uio.
- *
- *  IN:   vp  - vnode of symbolic link.
- *        uio - structure to receive the link path.
- *        cr  - credentials of caller.
- *        ct  - caller context
- *
- *  OUT:  uio - link path, limited to uio->uio_resid bytes.
- *
- *  RETURN: 0 if success
- *          error code if failure
- *
- * Timestamps:
- *        vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
-{
-    znode_t *zp = VTOZ(vp);
-    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-    dmu_buf_t *held = NULL;    /* data buffer, when the path is not inline */
-    void *src;
-    size_t linklen;
-    int error;
-
-    ZFS_ENTER(zfsvfs);
-    ZFS_VERIFY_ZP(zp);
-
-    linklen = (size_t)zp->z_phys->zp_size;
-
-    if (linklen + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
-        /* The path sits directly behind the znode_phys_t. */
-        src = zp->z_phys + 1;
-    } else {
-        /* The path is in the object's first data buffer. */
-        error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &held);
-        if (error != 0) {
-            ZFS_EXIT(zfsvfs);
-            return (error);
-        }
-        src = held->db_data;
-    }
-
-    error = uiomove(src, MIN(linklen, uio->uio_resid), UIO_READ, uio);
-
-    if (held != NULL)
-        dmu_buf_rele(held, FTAG);
-
-    ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-    ZFS_EXIT(zfsvfs);
-    return (error);
-}
-
-/*
- * Create a hard link: add an entry called name to directory tdvp that
- * refers to the existing file svp.
- *
- *  IN:   tdvp  - Directory to contain new entry.
- *        svp   - vnode of new entry.
- *        name  - name of new entry.
- *        cr    - credentials of caller.
- *        ct    - caller context
- *        flags - case flags
- *
- *  RETURN: 0 if success
- *          error code if failure
- *
- * Timestamps:
- *        tdvp - ctime|mtime updated
- *        svp  - ctime updated
- */
-/* ARGSUSED */
-static int
-zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
-    caller_context_t *ct, int flags)
-{
-    znode_t *dzp = VTOZ(tdvp);
-    zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
-    znode_t *szp;
-    znode_t *tzp;
-    zilog_t *zilog;
-    zfs_dirlock_t *dirlock;
-    dmu_tx_t *tx;
-    vnode_t *realvp;
-    uid_t src_owner;
-    int lockflags = ZNEW;
-    int error;
-
-    ASSERT(tdvp->v_type == VDIR);
-
-    ZFS_ENTER(zfsvfs);
-    ZFS_VERIFY_ZP(dzp);
-    zilog = zfsvfs->z_log;
-
-    /* Work on the real vnode if svp has one. */
-    if (VOP_REALVP(svp, &realvp, ct) == 0)
-        svp = realvp;
-
-    if (svp->v_vfsp != tdvp->v_vfsp) {
-        ZFS_EXIT(zfsvfs);
-        return (EXDEV);
-    }
-    szp = VTOZ(svp);
-    ZFS_VERIFY_ZP(szp);
-
-    if (zfsvfs->z_utf8 &&
-        u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE,
-        &error) < 0) {
-        ZFS_EXIT(zfsvfs);
-        return (EILSEQ);
-    }
-    if (flags & FIGNORECASE)
-        lockflags |= ZCILOOK;
-
-top:
-    /*
-     * Links between attribute space and normal file space are refused:
-     * they could be used to get around restrictions imposed in
-     * attribute space.
-     */
-    if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
-        (dzp->z_phys->zp_flags & ZFS_XATTR)) {
-        ZFS_EXIT(zfsvfs);
-        return (EINVAL);
-    }
-
-    /*
-     * POSIX dictates EPERM for a directory source, although ENOTSUP
-     * or EISDIR would be better choices.
-     */
-    if (svp->v_type == VDIR) {
-        ZFS_EXIT(zfsvfs);
-        return (EPERM);
-    }
-
-    /* A caller who is not the owner needs secpolicy_basic_link(). */
-    src_owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
-    if (src_owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
-        ZFS_EXIT(zfsvfs);
-        return (EPERM);
-    }
-
-    error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr);
-    if (error != 0) {
-        ZFS_EXIT(zfsvfs);
-        return (error);
-    }
-
-    /*
-     * Lock the directory entry; this fails if the name already exists.
-     */
-    error = zfs_dirent_lock(&dirlock, dzp, name, &tzp, lockflags,
-        NULL, NULL);
-    if (error != 0) {
-        ZFS_EXIT(zfsvfs);
-        return (error);
-    }
-
-    tx = dmu_tx_create(zfsvfs->z_os);
-    dmu_tx_hold_bonus(tx, szp->z_id);
-    dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-    error = dmu_tx_assign(tx, zfsvfs->z_assign);
-    if (error != 0) {
-        int retry;
-
-        zfs_dirent_unlock(dirlock);
-
-        /* ERESTART under TXG_NOWAIT: wait, then start over at top. */
-        retry = (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
-        if (retry)
-            dmu_tx_wait(tx);
-        dmu_tx_abort(tx);
-        if (retry)
-            goto top;
-
-        ZFS_EXIT(zfsvfs);
-        return (error);
-    }
-
-    error = zfs_link_create(dirlock, szp, tx, 0);
-    if (error == 0) {
-        uint64_t txtype;
-
-        txtype = (flags & FIGNORECASE) ? (TX_LINK | TX_CI) : TX_LINK;
-        zfs_log_link(zilog, tx, txtype, dzp, szp, name);
-    }
-
-    dmu_tx_commit(tx);
-
-    zfs_dirent_unlock(dirlock);
-
-    /* The link event is raised only after the entry lock is dropped. */
-    if (error == 0)
-        vnevent_link(svp, ct);
-
-    ZFS_EXIT(zfsvfs);
-    return (error);
-}
-
-/*
- * zfs_null_putapage() is used when the file system has been force
- * unmounted. It just drops the pages.
- *
- * The page list pp is handed to pvn_write_done() with
- * B_INVAL|B_FORCE|B_ERROR; nothing is written. The vp, offp, lenp,
- * flags and cr arguments are not used.
- *
- * RETURN: always 0
- */
-/* ARGSUSED */
-static int
-zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
- return (0);
-}
-
-/*
- * Push a page out to disk, klustering if possible.
- *
- *	IN:	vp	- file to push page to.
- *		pp	- page to push.
- *		flags	- additional flags.
- *		cr	- credentials of caller.
- *
- *	OUT:	offp	- start of range pushed.
- *		lenp	- len of range pushed.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * NOTE: callers must have locked the page to be pushed.  On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
- *
- * Once dmu_tx_assign() has succeeded the transaction is always
- * committed, whether or not the write itself succeeded; a failed
- * write only skips the timestamp update and the intent-log record.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
-    size_t *lenp, int flags, cred_t *cr)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
-	dmu_tx_t	*tx;
-	rl_t		*rl;
-	u_offset_t	off, koff;
-	size_t		len, klen;
-	uint64_t	filesz;
-	int		err;
-
-	filesz = zp->z_phys->zp_size;
-	off = pp->p_offset;
-	len = PAGESIZE;
-	/*
-	 * If our blocksize is bigger than the page size, try to kluster
-	 * multiple pages so that we write a full block (thus avoiding
-	 * a read-modify-write).
-	 */
-	if (off < filesz && zp->z_blksz > PAGESIZE) {
-		if (!ISP2(zp->z_blksz)) {
-			/* Only one block in the file. */
-			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
-			koff = 0;
-		} else {
-			klen = zp->z_blksz;
-			koff = P2ALIGN(off, (u_offset_t)klen);
-		}
-		ASSERT(koff <= filesz);
-		if (koff + klen > filesz)
-			klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
-		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
-	}
-	ASSERT3U(btop(len), ==, btopr(len));
-top:
-	rl = zfs_range_lock(zp, off, len, RL_WRITER);
-	/*
-	 * Can't push pages past end-of-file.  The size is re-read here
-	 * because it was first sampled before the range lock was taken.
-	 */
-	filesz = zp->z_phys->zp_size;
-	if (off >= filesz) {
-		/* ignore all pages */
-		err = 0;
-		goto out;
-	} else if (off + len > filesz) {
-		int npages = btopr(filesz - off);
-		page_t *trunc;
-
-		page_list_break(&pp, &trunc, npages);
-		/* ignore pages past end of file */
-		if (trunc)
-			pvn_write_done(trunc, flags);
-		len = filesz - off;
-	}
-
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_write(tx, zp->z_id, off, len);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	err = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (err != 0) {
-		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			/* Drop the range lock before waiting, then retry. */
-			zfs_range_unlock(rl);
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			err = 0;
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		goto out;
-	}
-
-	if (zp->z_blksz <= PAGESIZE) {
-		caddr_t va = ppmapin(pp, PROT_READ, (caddr_t)-1);
-		ASSERT3U(len, <=, PAGESIZE);
-		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
-		ppmapout(va);
-	} else {
-		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
-	}
-
-	if (err == 0) {
-		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
-	}
-	/*
-	 * The tx has been assigned, so it must be committed even when
-	 * the write failed; otherwise it is never released.
-	 */
-	dmu_tx_commit(tx);
-
-out:
-	zfs_range_unlock(rl);
-	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
-	if (offp)
-		*offp = off;
-	if (lenp)
-		*lenp = len;
-
-	return (err);
-}
-
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the file's vnode.
- *
- * IN: vp - vnode of file to push page data to.
- * off - position in file to put data.
- * len - amount of data to write.
- * flags - flags to control the operation.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - ctime|mtime updated
- *
- * A len of 0 pushes every dirty page at or beyond off. Otherwise len
- * is clamped to the current file size and the range is walked page by
- * page. On the paths that reach the out label, the intent log is
- * committed for this object unless B_ASYNC is set in flags; the early
- * return for off beyond end-of-file does not commit.
- */
-/*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp;
- size_t io_len;
- u_offset_t io_off;
- uint64_t filesz;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (len == 0) {
- /*
- * Search the entire vp list for pages >= off.
- */
- error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
- flags, cr);
- goto out;
- }
-
- filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
- if (off > filesz) {
- /* past end of file */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- len = MIN(len, filesz - off);
-
- /*
- * io_len is set on every iteration: either by zfs_putapage()
- * through &io_len, or to PAGESIZE when no dirty page was found.
- */
- for (io_off = off; io_off < off + len; io_off += io_len) {
- /* Blocking lookup for B_INVAL or synchronous requests. */
- if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
- pp = page_lookup(vp, io_off,
- (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
- } else {
- pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
- }
-
- if (pp != NULL && pvn_getdirty(pp, flags)) {
- int err;
-
- /*
- * Found a dirty page to push
- */
- err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
- /* Keep going; the last non-zero error is returned. */
- if (err)
- error = err;
- } else {
- io_len = PAGESIZE;
- }
- }
-out:
- if ((flags & B_ASYNC) == 0)
- zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Called when the vnode becomes inactive. The whole function runs with
- * z_teardown_inactive_lock held as reader.
- *
- * If the znode no longer has a dbuf, cached pages are dropped via
- * zfs_null_putapage(), the hold count is zeroed and the znode is freed.
- * Otherwise cached pages are pushed asynchronously, a pending atime
- * update is dirtied in its own transaction, and zfs_zinactive() is
- * called.
- */
-/*ARGSUSED*/
-void
-zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf == NULL) {
- /*
- * The fs has been unmounted, or we did a
- * suspend/resume and this file no longer exists.
- */
- if (vn_has_cached_data(vp)) {
- (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
- B_INVAL, cr);
- }
-
- mutex_enter(&zp->z_lock);
- vp->v_count = 0; /* count arrives as 1 */
- mutex_exit(&zp->z_lock);
- /* The teardown lock is released before the znode is freed. */
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- zfs_znode_free(zp);
- return;
- }
-
- /*
- * Attempt to push any data in the page cache. If this fails
- * we will get kicked out later in zfs_zinactive().
- */
- if (vn_has_cached_data(vp)) {
- (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
- cr);
- }
-
- if (zp->z_atime_dirty && zp->z_unlinked == 0) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_bonus(tx, zp->z_id);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /* Assignment failed: z_atime_dirty is left set. */
- dmu_tx_abort(tx);
- } else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
- zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
- dmu_tx_commit(tx);
- }
- }
-
- zfs_zinactive(zp);
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
-}
-
-/*
- * Validate the target offset of a seek.
- *
- *	IN:	vp	- vnode being seeked within
- *		ooff	- previous file offset (not examined)
- *		noffp	- pointer to the requested new offset
- *		ct	- caller context
- *
- *	RETURN:	0 for any directory, or when the new offset lies in
- *		[0, MAXOFFSET_T]
- *		EINVAL otherwise
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
-    caller_context_t *ct)
-{
-	offset_t target;
-
-	/* Directory offsets are accepted without a range check. */
-	if (vp->v_type == VDIR)
-		return (0);
-
-	target = *noffp;
-	if (target >= 0 && target <= MAXOFFSET_T)
-		return (0);
-
-	return (EINVAL);
-}
-
-/*
- * Front end for the generic record-locking routine.  A request is
- * refused with EAGAIN when the file is currently memory mapped and
- * its mode selects mandatory locking; everything else is passed
- * through to fs_frlock().
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
-    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	/*
-	 * Same treatment of mapcnt as UFS: an existing mapping yields
-	 * an error, and no attempt is made to close the race between
-	 * this check and zfs_map().
-	 */
-	if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode))
-		error = EAGAIN;
-	else
-		error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering).
- *
- * Returns EFAULT when off is at or beyond the current file size, the
- * dmu_read() error if filling a page fails, and 0 otherwise. A return
- * of 0 with *pl set to NULL means no page was created here and the
- * caller is expected to retry its lookup.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
- caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
- znode_t *zp = VTOZ(vp);
- page_t *pp, *cur_pp;
- objset_t *os = zp->z_zfsvfs->z_os;
- caddr_t va;
- u_offset_t io_off, total;
- uint64_t oid = zp->z_id;
- size_t io_len;
- uint64_t filesz;
- int err;
-
- /*
- * If we are only asking for a single page don't bother klustering.
- */
- filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
- if (off >= filesz)
- return (EFAULT);
- if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
- io_off = off;
- io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
- } else {
- /*
- * Try to fill a kluster of pages (a blocks worth).
- */
- size_t klen;
- u_offset_t koff;
-
- if (!ISP2(zp->z_blksz)) {
- /* Only one block in the file. */
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = 0;
- } else {
- /*
- * It would be ideal to align our offset to the
- * blocksize but doing so has resulted in some
- * strange application crashes. For now, we
- * leave the offset as is and only adjust the
- * length if we are off the end of the file.
- */
- koff = off;
- klen = plsz;
- }
- ASSERT(koff <= filesz);
- /* Trim the kluster to the page-rounded end of file. */
- if (koff + klen > filesz)
- klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff;
- ASSERT3U(off, >=, koff);
- ASSERT3U(off, <, koff + klen);
- pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, koff, klen, 0);
- }
- if (pp == NULL) {
- /*
- * Some other thread entered the page before us.
- * Return to zfs_getpage to retry the lookup.
- */
- *pl = NULL;
- return (0);
- }
-
- /*
- * Fill the pages in the kluster.
- */
- cur_pp = pp;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- ASSERT3U(io_off, ==, cur_pp->p_offset);
- va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
- err = dmu_read(os, oid, io_off, PAGESIZE, va);
- ppmapout(va);
- if (err) {
- /* On error, toss the entire kluster */
- pvn_read_done(pp, B_ERROR);
- return (err);
- }
- cur_pp = cur_pp->p_next;
- }
- /* NOTE(review): no goto in this function targets the out label. */
-out:
- /*
- * Fill in the page list array from the kluster. If
- * there are too many pages in the kluster, return
- * as many pages as possible starting from the desired
- * offset `off'.
- * NOTE: the page list will always be null terminated.
- */
- pvn_plist_init(pp, pl, plsz, off, io_len, rw);
-
- return (0);
-}
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array. If plsz is greater than len, this function may
- * also return page pointers from before or after the specified
- * region (i.e. some region [off', off' + plsz]). These additional
- * pages are only returned if they are already in the cache, or were
- * created as part of a klustered read.
- *
- * IN: vp - vnode of file to get data from.
- * off - position in file to get data from.
- * len - amount of data to retrieve.
- * plsz - length of provided page list.
- * seg - segment to obtain pages for.
- * addr - virtual address of fault.
- * rw - mode of created pages.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * OUT: protp - protection mode of created pages.
- * pl - list of pages created.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - atime updated
- *
- * A NULL pl is treated as a fault-ahead request and returns 0 without
- * doing any work. EFAULT is returned when off is at or beyond the file
- * size, both on entry and when re-checked at the end.
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp, **pl0 = pl;
- int need_unlock = 0, err = 0;
- offset_t orig_off;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (protp)
- *protp = PROT_ALL;
-
- /* no faultahead (for now) */
- if (pl == NULL) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /* can't fault past EOF */
- if (off >= zp->z_phys->zp_size) {
- ZFS_EXIT(zfsvfs);
- return (EFAULT);
- }
- /* Remembered for the end-of-file re-check at the out label. */
- orig_off = off;
-
- /*
- * If we already own the lock, then we must be page faulting
- * in the middle of a write to this file (i.e., we are writing
- * to this file using data from a mapped region of the file).
- */
- if (rw_owner(&zp->z_map_lock) != curthread) {
- rw_enter(&zp->z_map_lock, RW_WRITER);
- need_unlock = TRUE;
- }
-
- /*
- * Loop through the requested range [off, off + len] looking
- * for pages. If we don't find a page, we will need to create
- * a new page and fill it with data from the file.
- */
- while (len > 0) {
- if (plsz < PAGESIZE)
- break;
- if (pp = page_lookup(vp, off, SE_SHARED)) {
- *pl++ = pp;
- off += PAGESIZE;
- addr += PAGESIZE;
- len -= PAGESIZE;
- plsz -= PAGESIZE;
- } else {
- err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
- if (err)
- goto out;
- /*
- * klustering may have changed our region
- * to be block aligned.
- */
- if (((pp = *pl) != 0) && (off != pp->p_offset)) {
- int delta = off - pp->p_offset;
- len += delta;
- off -= delta;
- addr -= delta;
- }
- /*
- * Step over each page zfs_fillpage() stored in pl;
- * len is clamped at 0 rather than allowed to wrap.
- */
- while (*pl) {
- pl++;
- off += PAGESIZE;
- addr += PAGESIZE;
- plsz -= PAGESIZE;
- if (len > PAGESIZE)
- len -= PAGESIZE;
- else
- len = 0;
- }
- }
- }
-
- /*
- * Fill out the page array with any pages already in the cache.
- */
- while (plsz > 0) {
- pp = page_lookup_nowait(vp, off, SE_SHARED);
- if (pp == NULL)
- break;
- *pl++ = pp;
- off += PAGESIZE;
- plsz -= PAGESIZE;
- }
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-out:
- /*
- * We can't grab the range lock for the page as reader which would
- * stop truncation as this leads to deadlock. So we need to recheck
- * the file size.
- */
- if (orig_off >= zp->z_phys->zp_size)
- err = EFAULT;
- if (err) {
- /*
- * Release any pages we have previously locked.
- */
- while (pl > pl0)
- page_unlock(*--pl);
- }
-
- /* Terminate the list; after an error pl is back at pl0. */
- *pl = NULL;
-
- if (need_unlock)
- rw_exit(&zp->z_map_lock);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Set up a memory mapping for part of a file.  The path from the
- * system call down to here and back into the VM system is:
- *
- *	mmap() in common code reaches smmap_common()
- *
- *	smmap_common() calls VOP_MAP(), which lands in zfs_map()
- *
- *	zfs_map() calls as_map() with segvn_create() as the callback
- *
- *	segvn_create() builds the segment and calls VOP_ADDMAP()
- *
- *	zfs_addmap() updates z_mapcnt
- *
- * The request is validated in a fixed order and the first failing
- * check decides the error: EPERM, EACCES, ENOSYS, ENXIO, ENODEV,
- * EAGAIN, then whatever choose_addr() or as_map() report.
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	segvn_crargs_t	vn_a;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	/* Writable mappings are refused on protected files. */
-	if ((prot & PROT_WRITE) &&
-	    (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
-	    ZFS_APPENDONLY))) {
-		error = EPERM;
-		goto done;
-	}
-
-	/* Quarantined files may not be mapped for read or exec. */
-	if ((prot & (PROT_READ | PROT_EXEC)) &&
-	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
-		error = EACCES;
-		goto done;
-	}
-
-	if (vp->v_flag & VNOMAP) {
-		error = ENOSYS;
-		goto done;
-	}
-
-	if (off < 0 || len > MAXOFFSET_T - off) {
-		error = ENXIO;
-		goto done;
-	}
-
-	if (vp->v_type != VREG) {
-		error = ENODEV;
-		goto done;
-	}
-
-	/*
-	 * A file with mandatory locking and existing locks cannot
-	 * be mapped.
-	 */
-	if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
-		error = EAGAIN;
-		goto done;
-	}
-
-	as_rangelock(as);
-	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
-	if (error == 0) {
-		vn_a.vp = vp;
-		vn_a.offset = (u_offset_t)off;
-		vn_a.type = flags & MAP_TYPE;
-		vn_a.prot = prot;
-		vn_a.maxprot = maxprot;
-		vn_a.cred = cr;
-		vn_a.amp = NULL;
-		vn_a.flags = flags & ~MAP_TYPE;
-		vn_a.szc = 0;
-		vn_a.lgrp_mem_policy_flags = 0;
-
-		error = as_map(as, *addrp, len, segvn_create, &vn_a);
-	}
-	as_rangeunlock(as);
-
-done:
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * Account for a new mapping of the file: z_mapcnt grows by the number
- * of pages needed to cover len.  Always returns 0.
- */
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	uint64_t	npages = btopr(len);
-
-	atomic_add_64(&zp->z_mapcnt, npages);
-
-	return (0);
-}
-
-/*
- * Dirty pages are pushed from zfs_delmap() so that the file ends up
- * with a more accurate mtime.  There is no way to detect when mapped
- * data was actually modified, so heuristics are used instead.  With
- * an explicit msync(), the mtime is marked when the last page is
- * pushed.  The trouble is the case where msync() is omitted, which is
- * by far the most common one:
- *
- *	open()
- *	mmap()
- *	<modify memory>
- *	munmap()
- *	close()
- *	<time lapse>
- *	putpage() via fsflush
- *
- * Waiting for fsflush would leave a modification time at some
- * arbitrary later point.  To avoid that in the common case, pages are
- * flushed whenever a (MAP_SHARED, PROT_WRITE) mapping is torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	uint64_t	npages = btopr(len);
-
-	/* Drop the page count that zfs_addmap() added for this range. */
-	ASSERT3U(zp->z_mapcnt, >=, npages);
-	atomic_add_64(&zp->z_mapcnt, -npages);
-
-	/* Only shared, writable mappings can have dirtied the file. */
-	if (!(flags & MAP_SHARED) || !(prot & PROT_WRITE))
-		return (0);
-
-	if (vn_has_cached_data(vp))
-		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
-	return (0);
-}
-
-/*
- * Free or allocate space in a file.  Currently, this function only
- * supports the `F_FREESP' command.  However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- *	IN:	vp	- vnode of file to free data in.
- *		cmd	- action to take (only F_FREESP supported).
- *		bfp	- section of file to free/alloc.
- *		flag	- current file open mode flags.
- *		offset	- current file offset.
- *		cr	- credentials of caller [UNUSED].
- *		ct	- caller context.
- *
- *	RETURN:	0 if success
- *		EINVAL if cmd is not F_FREESP or bfp->l_len is negative
- *		error code from convoff() or zfs_freesp() otherwise
- *
- * Timestamps:
- *	vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
-    offset_t offset, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	off, len;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if (cmd != F_FREESP) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	/* Convert bfp in place via convoff() before reading its fields. */
-	if ((error = convoff(vp, bfp, 0, offset)) != 0) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	if (bfp->l_len < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	off = bfp->l_start;
-	len = bfp->l_len; /* 0 means from off to end of file */
-
-	/* Retry while the tx could not be assigned without waiting. */
-	do {
-		error = zfs_freesp(zp, off, len, flag, TRUE);
-		/* NB: we already did dmu_tx_wait() if necessary */
-	} while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * Build a file identifier for vp in the caller-supplied fidp.
- *
- * A long fid is produced when this file system's z_parent is not the
- * file system itself, a short fid otherwise.  If fidp->fid_len is too
- * small, the required length is stored there and ENOSPC is returned.
- * Object number, generation and (for long fids) objset id are stored
- * one byte at a time, least significant byte first.
- *
- *	RETURN:	0 if success
- *		ENOSPC if the supplied fid is too short
- */
-/*ARGSUSED*/
-static int
-zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	objnum = zp->z_id;
-	uint32_t	gen;
-	zfid_short_t	*sfid;
-	int		need, b;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	gen = (uint32_t)zp->z_gen;
-
-	if (zfsvfs->z_parent != zfsvfs)
-		need = LONG_FID_LEN;
-	else
-		need = SHORT_FID_LEN;
-
-	if (fidp->fid_len < need) {
-		/* Tell the caller how much room is required. */
-		fidp->fid_len = need;
-		ZFS_EXIT(zfsvfs);
-		return (ENOSPC);
-	}
-
-	sfid = (zfid_short_t *)fidp;
-	sfid->zf_len = need;
-
-	for (b = 0; b < sizeof (sfid->zf_object); b++)
-		sfid->zf_object[b] = (uint8_t)(objnum >> (8 * b));
-
-	/* A generation of zero is reserved to distinguish .zfs. */
-	if (gen == 0)
-		gen = 1;
-	for (b = 0; b < sizeof (sfid->zf_gen); b++)
-		sfid->zf_gen[b] = (uint8_t)(gen >> (8 * b));
-
-	if (need == LONG_FID_LEN) {
-		uint64_t	setid = dmu_objset_id(zfsvfs->z_os);
-		zfid_long_t	*lfid = (zfid_long_t *)fidp;
-
-		for (b = 0; b < sizeof (lfid->zf_setid); b++)
-			lfid->zf_setid[b] = (uint8_t)(setid >> (8 * b));
-
-		/* XXX - this should be the generation number for the objset */
-		for (b = 0; b < sizeof (lfid->zf_setgen); b++)
-			lfid->zf_setgen[b] = 0;
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
-
-/*
- * Answer pathconf queries for a ZFS vnode. The commands handled here
- * store their answer in *valp; any other command is passed to
- * fs_pathconf().
- *
- * RETURN: 0 if success
- * error code from zfs_dirent_lock() (other than ENOENT) for
- * _PC_XATTR_EXISTS, or from fs_pathconf() for unhandled commands
- */
-static int
-zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp, *xzp;
- zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
- int error;
-
- switch (cmd) {
- case _PC_LINK_MAX:
- *valp = ULONG_MAX;
- return (0);
-
- case _PC_FILESIZEBITS:
- *valp = 64;
- return (0);
-
- case _PC_XATTR_EXISTS:
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- *valp = 0;
- /* Look up the extended attribute directory of zp. */
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
- if (error == 0) {
- zfs_dirent_unlock(dl);
- /* Report 1 only if the directory has entries. */
- if (!zfs_dirempty(xzp))
- *valp = 1;
- VN_RELE(ZTOV(xzp));
- } else if (error == ENOENT) {
- /*
- * If there aren't extended attributes, it's the
- * same as having zero of them.
- */
- error = 0;
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-
- case _PC_SATTR_ENABLED:
- case _PC_SATTR_EXISTS:
- /* Both commands yield the same value here. */
- *valp = vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) &&
- (vp->v_type == VREG || vp->v_type == VDIR);
- return (0);
-
- case _PC_ACL_ENABLED:
- *valp = _ACL_ACE_ENABLED;
- return (0);
-
- case _PC_MIN_HOLE_SIZE:
- *valp = (ulong_t)SPA_MINBLOCKSIZE;
- return (0);
-
- default:
- return (fs_pathconf(vp, cmd, valp, cr, ct));
- }
-}
-
-/*
- * Fetch the ACL of vp into vsecp via zfs_getacl().  The ACL access
- * check is skipped when ATTR_NOACLCHECK is set in flag.
- */
-/*ARGSUSED*/
-static int
-zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	boolean_t	nocheck = B_FALSE;
-	int		rc;
-
-	if (flag & ATTR_NOACLCHECK)
-		nocheck = B_TRUE;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	rc = zfs_getacl(zp, vsecp, nocheck, cr);
-	ZFS_EXIT(zfsvfs);
-
-	return (rc);
-}
-
-/*
- * Replace the ACL of vp with the one in vsecp via zfs_setacl().  The
- * ACL access check is skipped when ATTR_NOACLCHECK is set in flag.
- */
-/*ARGSUSED*/
-static int
-zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	boolean_t	nocheck = B_FALSE;
-	int		rc;
-
-	if (flag & ATTR_NOACLCHECK)
-		nocheck = B_TRUE;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	rc = zfs_setacl(zp, vsecp, nocheck, cr);
-	ZFS_EXIT(zfsvfs);
-
-	return (rc);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-
-/*
- * Stub operation that ignores its arguments and returns EINVAL.
- */
-static int
-zfs_inval()
-{
- return (EINVAL);
-}
-
-/*
- * Stub operation that ignores its arguments and returns EISDIR.
- */
-static int
-zfs_isdir()
-{
- return (EISDIR);
-}
-/*
- * Directory vnode operations template
- *
- * READ and WRITE are routed to the zfs_isdir stub. The table ends
- * with a NULL, NULL entry.
- */
-vnodeops_t *zfs_dvnodeops;
-const fs_operation_def_t zfs_dvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .error = zfs_isdir },
- VOPNAME_WRITE, { .error = zfs_isdir },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Regular file vnode operations template
- *
- * This is the only template in this file that lists the paging and
- * mapping operations (GETPAGE, PUTPAGE, MAP, ADDMAP, DELMAP) as well
- * as FRLOCK and SPACE. The table ends with a NULL, NULL entry.
- */
-vnodeops_t *zfs_fvnodeops;
-const fs_operation_def_t zfs_fvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .vop_read = zfs_read },
- VOPNAME_WRITE, { .vop_write = zfs_write },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
- VOPNAME_SPACE, { .vop_space = zfs_space },
- VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
- VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
- VOPNAME_MAP, { .vop_map = zfs_map },
- VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
- VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Symbolic link vnode operations template
- *
- * Lists attribute, access, rename, readlink, inactive, fid, pathconf
- * and vnevent operations only. The table ends with a NULL, NULL entry.
- */
-vnodeops_t *zfs_symvnodeops;
-const fs_operation_def_t zfs_symvnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Extended attribute directory vnode operations template
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- * VOP_MKDIR()
- * VOP_SYMLINK()
- * Note that there are other restrictions embedded in:
- * zfs_create() - restrict type to VREG
- * zfs_link() - no links into/out of attribute space
- * zfs_rename() - no moves into/out of attribute space
- *
- * The two restricted operations are routed to the zfs_inval stub.
- * NOTE(review): unlike the directory template, this table has no
- * READ or WRITE entries; confirm the comparison above is still
- * accurate.
- */
-vnodeops_t *zfs_xdvnodeops;
-const fs_operation_def_t zfs_xdvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .error = zfs_inval },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .error = zfs_inval },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Error vnode operations template
- *
- * Only INACTIVE and PATHCONF are listed. The table ends with a
- * NULL, NULL entry.
- */
-vnodeops_t *zfs_evnodeops;
-const fs_operation_def_t zfs_evnodeops_template[] = {
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- NULL, NULL
-};
diff --git a/zfs/lib/libdmu-ctl/zvol.c b/zfs/lib/libdmu-ctl/zvol.c
deleted file mode 100644
index 5d16a4d1f..000000000
--- a/zfs/lib/libdmu-ctl/zvol.c
+++ /dev/null
@@ -1,1830 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "@(#)zvol.c 1.31 08/04/09 SMI"
-
-/*
- * ZFS volume emulation driver.
- *
- * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
- * Volumes are accessed through the symbolic links named:
- *
- * /dev/zvol/dsk/<pool_name>/<dataset_name>
- * /dev/zvol/rdsk/<pool_name>/<dataset_name>
- *
- * These links are created by the ZFS-specific devfsadm link generator.
- * Volumes are persistent through reboot. No user command needs to be
- * run before opening and using a device.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/buf.h>
-#include <sys/modctl.h>
-#include <sys/open.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dnode.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dkio.h>
-#include <sys/efi_partition.h>
-#include <sys/byteorder.h>
-#include <sys/pathname.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/crc32.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/mkdev.h>
-#include <sys/zil.h>
-#include <sys/refcount.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_rlock.h>
-#include <sys/vdev_disk.h>
-#include <sys/vdev_impl.h>
-#include <sys/zvol.h>
-#include <sys/dumphdr.h>
-
-#include "zfs_namecheck.h"
-
-static void *zvol_state;
-
-#define ZVOL_DUMPSIZE "dumpsize"
-
-/*
- * This lock protects the zvol_state structure from being modified
- * while it's being used, e.g. an open that comes in before a create
- * finishes. It also protects temporary opens of the dataset so that,
- * e.g., an open doesn't get a spurious EBUSY.
- */
-static kmutex_t zvol_state_lock;
-static uint32_t zvol_minors;
-
-#define NUM_EXTENTS ((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))
-
-typedef struct zvol_extent {
- dva_t ze_dva; /* dva associated with this extent */
- uint64_t ze_stride; /* extent stride */
- uint64_t ze_size; /* number of blocks in extent */
-} zvol_extent_t;
-
-/*
- * The list of extents associated with the dump device
- */
-typedef struct zvol_ext_list {
- zvol_extent_t zl_extents[NUM_EXTENTS];
- struct zvol_ext_list *zl_next;
-} zvol_ext_list_t;
-
-/*
- * The in-core state of each volume.
- */
-typedef struct zvol_state {
- char zv_name[MAXPATHLEN]; /* pool/dd name */
- uint64_t zv_volsize; /* amount of space we advertise */
- uint64_t zv_volblocksize; /* volume block size */
- minor_t zv_minor; /* minor number */
- uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_flags; /* readonly; dumpified */
- objset_t *zv_objset; /* objset handle */
- uint32_t zv_mode; /* DS_MODE_* flags at open time */
- uint32_t zv_open_count[OTYPCNT]; /* open counts */
- uint32_t zv_total_opens; /* total open count */
- zilog_t *zv_zilog; /* ZIL handle */
- zvol_ext_list_t *zv_list; /* List of extents for dump */
- uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
- znode_t zv_znode; /* for range locking */
-} zvol_state_t;
-
-/*
- * zvol specific flags
- */
-#define ZVOL_RDONLY 0x1
-#define ZVOL_DUMPIFIED 0x2
-
-/*
- * zvol maximum transfer in one DMU tx.
- */
-int zvol_maxphys = DMU_MAX_ACCESS/2;
-
-extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
-static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
-static int zvol_dumpify(zvol_state_t *zv);
-static int zvol_dump_fini(zvol_state_t *zv);
-static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
-
-static void
-zvol_size_changed(zvol_state_t *zv, major_t maj)
-{
- dev_t dev = makedevice(maj, zv->zv_minor);
-
- VERIFY(ddi_prop_update_int64(dev, zfs_dip,
- "Size", zv->zv_volsize) == DDI_SUCCESS);
- VERIFY(ddi_prop_update_int64(dev, zfs_dip,
- "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
-
- /* Notify specfs to invalidate the cached size */
- spec_size_invalidate(dev, VBLK);
- spec_size_invalidate(dev, VCHR);
-}
-
-int
-zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
-{
- if (volsize == 0)
- return (EINVAL);
-
- if (volsize % blocksize != 0)
- return (EINVAL);
-
-#ifdef _ILP32
- if (volsize - 1 > SPEC_MAXOFFSET_T)
- return (EOVERFLOW);
-#endif
- return (0);
-}
-
-int
-zvol_check_volblocksize(uint64_t volblocksize)
-{
- if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
- !ISP2(volblocksize))
- return (EDOM);
-
- return (0);
-}
-
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
- zvol_state_t *zv = arg;
-
- if (newval)
- zv->zv_flags |= ZVOL_RDONLY;
- else
- zv->zv_flags &= ~ZVOL_RDONLY;
-}
-
-int
-zvol_get_stats(objset_t *os, nvlist_t *nv)
-{
- int error;
- dmu_object_info_t doi;
- uint64_t val;
-
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
- if (error)
- return (error);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
-
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
-
- if (error == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
- doi.doi_data_block_size);
- }
-
- return (error);
-}
-
-/*
- * Find a free minor number.
- */
-static minor_t
-zvol_minor_alloc(void)
-{
- minor_t minor;
-
- ASSERT(MUTEX_HELD(&zvol_state_lock));
-
- for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
- if (ddi_get_soft_state(zvol_state, minor) == NULL)
- return (minor);
-
- return (0);
-}
-
-static zvol_state_t *
-zvol_minor_lookup(const char *name)
-{
- minor_t minor;
- zvol_state_t *zv;
-
- ASSERT(MUTEX_HELD(&zvol_state_lock));
-
- for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
- zv = ddi_get_soft_state(zvol_state, minor);
- if (zv == NULL)
- continue;
- if (strcmp(zv->zv_name, name) == 0)
- break;
- }
-
- return (zv);
-}
-
-void
-zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
-{
- ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
- ze->ze_stride = 0;
- ze->ze_size = 1;
-}
-
-/* extent mapping arg */
-struct maparg {
- zvol_ext_list_t *ma_list;
- zvol_extent_t *ma_extent;
- int ma_gang;
-};
-
-/*ARGSUSED*/
-static int
-zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- zbookmark_t *zb = &bc->bc_bookmark;
- blkptr_t *bp = &bc->bc_blkptr;
- void *data = bc->bc_data;
- dnode_phys_t *dnp = bc->bc_dnode;
- struct maparg *ma = (struct maparg *)arg;
- uint64_t stride;
-
- /* If there is an error, then keep trying to make progress */
- if (bc->bc_errno)
- return (ERESTART);
-
-#ifdef ZFS_DEBUG
- if (zb->zb_level == -1) {
- ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
- ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
- } else {
- ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
- ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- }
-
- if (zb->zb_level > 0) {
- uint64_t fill = 0;
- blkptr_t *bpx, *bpend;
-
- for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
- bpx < bpend; bpx++) {
- if (bpx->blk_birth != 0) {
- fill += bpx->blk_fill;
- } else {
- ASSERT(bpx->blk_fill == 0);
- }
- }
- ASSERT3U(fill, ==, bp->blk_fill);
- }
-
- if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
- uint64_t fill = 0;
- dnode_phys_t *dnx, *dnend;
-
- for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
- dnx < dnend; dnx++) {
- if (dnx->dn_type != DMU_OT_NONE)
- fill++;
- }
- ASSERT3U(fill, ==, bp->blk_fill);
- }
-#endif
-
- if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
- return (0);
-
- /* Abort immediately if we have encountered gang blocks */
- if (BP_IS_GANG(bp)) {
- ma->ma_gang++;
- return (EINTR);
- }
-
- /* first time? */
- if (ma->ma_extent->ze_size == 0) {
- zvol_init_extent(ma->ma_extent, bp);
- return (0);
- }
-
- stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
- ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
- (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
- if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
- DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
- if (ma->ma_extent->ze_stride == 0) {
- /* second block in this extent */
- ma->ma_extent->ze_stride = stride;
- ma->ma_extent->ze_size++;
- return (0);
- } else if (ma->ma_extent->ze_stride == stride) {
- /*
- * the block we allocated has the same
- * stride
- */
- ma->ma_extent->ze_size++;
- return (0);
- }
- }
-
- /*
- * dtrace -n 'zfs-dprintf
- * /stringof(arg0) == "zvol.c"/
- * {
- * printf("%s: %s", stringof(arg1), stringof(arg3))
- * } '
- */
- dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
- ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
- dprintf_bp(bp, "%s", "next blkptr:");
- /* start a new extent */
- if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
- ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
- KM_SLEEP);
- ma->ma_list = ma->ma_list->zl_next;
- ma->ma_extent = &ma->ma_list->zl_extents[0];
- } else {
- ma->ma_extent++;
- }
- zvol_init_extent(ma->ma_extent, bp);
- return (0);
-}
-
-/* ARGSUSED */
-void
-zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
-{
- zfs_creat_t *zct = arg;
- nvlist_t *nvprops = zct->zct_props;
- int error;
- uint64_t volblocksize, volsize;
-
- VERIFY(nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
- if (nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
- volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
-
- /*
- * These properties must be removed from the list so the generic
- * property setting step won't apply to them.
- */
- VERIFY(nvlist_remove_all(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
- (void) nvlist_remove_all(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
-
- error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
- ASSERT(error == 0);
-}
-
-/*
- * Replay a TX_WRITE ZIL transaction that didn't get committed
- * after a system failure
- */
-static int
-zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
-{
- objset_t *os = zv->zv_objset;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
- dmu_tx_t *tx;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
- error = dmu_tx_assign(tx, zv->zv_txg_assign);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
- dmu_tx_commit(tx);
- }
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-/*
- * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
- */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
- zvol_replay_err, /* 0 no such transaction type */
- zvol_replay_err, /* TX_CREATE */
- zvol_replay_err, /* TX_MKDIR */
- zvol_replay_err, /* TX_MKXATTR */
- zvol_replay_err, /* TX_SYMLINK */
- zvol_replay_err, /* TX_REMOVE */
- zvol_replay_err, /* TX_RMDIR */
- zvol_replay_err, /* TX_LINK */
- zvol_replay_err, /* TX_RENAME */
- zvol_replay_write, /* TX_WRITE */
- zvol_replay_err, /* TX_TRUNCATE */
- zvol_replay_err, /* TX_SETATTR */
- zvol_replay_err, /* TX_ACL */
-};
-
-/*
- * reconstruct dva that gets us to the desired offset (offset
- * is in bytes)
- */
-int
-zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
-{
- zvol_ext_list_t *zl;
- zvol_extent_t *ze;
- int idx;
- uint64_t tmp;
-
- if ((zl = zv->zv_list) == NULL)
- return (EIO);
- idx = 0;
- ze = &zl->zl_extents[0];
- while (offset >= ze->ze_size * zv->zv_volblocksize) {
- offset -= ze->ze_size * zv->zv_volblocksize;
-
- if (idx == NUM_EXTENTS - 1) {
- /* we've reached the end of this array */
- ASSERT(zl->zl_next != NULL);
- if (zl->zl_next == NULL)
- return (-1);
- zl = zl->zl_next;
- ze = &zl->zl_extents[0];
- idx = 0;
- } else {
- ze++;
- idx++;
- }
- }
- DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
- tmp = DVA_GET_OFFSET((&ze->ze_dva));
- tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
- DVA_SET_OFFSET(dva, tmp);
- return (0);
-}
-
-static void
-zvol_free_extents(zvol_state_t *zv)
-{
- zvol_ext_list_t *zl;
- zvol_ext_list_t *tmp;
-
- if (zv->zv_list != NULL) {
- zl = zv->zv_list;
- while (zl != NULL) {
- tmp = zl->zl_next;
- kmem_free(zl, sizeof (zvol_ext_list_t));
- zl = tmp;
- }
- zv->zv_list = NULL;
- }
-}
-
-int
-zvol_get_lbas(zvol_state_t *zv)
-{
- struct maparg ma;
- zvol_ext_list_t *zl;
- zvol_extent_t *ze;
- uint64_t blocks = 0;
- int err;
-
- ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
- ma.ma_extent = &ma.ma_list->zl_extents[0];
- ma.ma_gang = 0;
- zv->zv_list = ma.ma_list;
-
- err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
- if (err == EINTR && ma.ma_gang) {
- /*
- * We currently don't support dump devices when the pool
- * is so fragmented that our allocation has resulted in
- * gang blocks.
- */
- zvol_free_extents(zv);
- return (EFRAGS);
- }
- ASSERT3U(err, ==, 0);
-
- ze = &zl->zl_extents[0];
- while (ze) {
- blocks += ze->ze_size;
- if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
- zl = zl->zl_next;
- ze = &zl->zl_extents[0];
- } else {
- ze++;
- }
- }
- if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
- zvol_free_extents(zv);
- return (EIO);
- }
-
- return (0);
-}
-
-/*
- * Create a minor node (plus a whole lot more) for the specified volume.
- */
-int
-zvol_create_minor(const char *name, major_t maj)
-{
- zvol_state_t *zv;
- objset_t *os;
- dmu_object_info_t doi;
- uint64_t volsize;
- minor_t minor = 0;
- struct pathname linkpath;
- int ds_mode = DS_MODE_PRIMARY;
- vnode_t *vp = NULL;
- char *devpath;
- size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1;
- char chrbuf[30], blkbuf[30];
- int error;
-
- mutex_enter(&zvol_state_lock);
-
- if ((zv = zvol_minor_lookup(name)) != NULL) {
- mutex_exit(&zvol_state_lock);
- return (EEXIST);
- }
-
- if (strchr(name, '@') != 0)
- ds_mode |= DS_MODE_READONLY;
-
- error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
-
- if (error) {
- mutex_exit(&zvol_state_lock);
- return (error);
- }
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
-
- if (error) {
- dmu_objset_close(os);
- mutex_exit(&zvol_state_lock);
- return (error);
- }
-
- /*
- * If there's an existing /dev/zvol symlink, try to use the
- * same minor number we used last time.
- */
- devpath = kmem_alloc(devpathlen, KM_SLEEP);
-
- (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name);
-
- error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
-
- kmem_free(devpath, devpathlen);
-
- if (error == 0 && vp->v_type != VLNK)
- error = EINVAL;
-
- if (error == 0) {
- pn_alloc(&linkpath);
- error = pn_getsymlink(vp, &linkpath, kcred);
- if (error == 0) {
- char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
- if (ms != NULL) {
- ms += strlen(ZVOL_PSEUDO_DEV);
- minor = stoi(&ms);
- }
- }
- pn_free(&linkpath);
- }
-
- if (vp != NULL)
- VN_RELE(vp);
-
- /*
- * If we found a minor but it's already in use, we must pick a new one.
- */
- if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
- minor = 0;
-
- if (minor == 0)
- minor = zvol_minor_alloc();
-
- if (minor == 0) {
- dmu_objset_close(os);
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
-
- if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
- dmu_objset_close(os);
- mutex_exit(&zvol_state_lock);
- return (EAGAIN);
- }
-
- (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
- (char *)name);
-
- (void) sprintf(chrbuf, "%uc,raw", minor);
-
- if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
- minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
- ddi_soft_state_free(zvol_state, minor);
- dmu_objset_close(os);
- mutex_exit(&zvol_state_lock);
- return (EAGAIN);
- }
-
- (void) sprintf(blkbuf, "%uc", minor);
-
- if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
- minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
- ddi_remove_minor_node(zfs_dip, chrbuf);
- ddi_soft_state_free(zvol_state, minor);
- dmu_objset_close(os);
- mutex_exit(&zvol_state_lock);
- return (EAGAIN);
- }
-
- zv = ddi_get_soft_state(zvol_state, minor);
-
- (void) strcpy(zv->zv_name, name);
- zv->zv_min_bs = DEV_BSHIFT;
- zv->zv_minor = minor;
- zv->zv_volsize = volsize;
- zv->zv_objset = os;
- zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, zvol_get_data);
- mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
- /* get and cache the blocksize */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- ASSERT(error == 0);
- zv->zv_volblocksize = doi.doi_data_block_size;
-
- zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
- zvol_size_changed(zv, maj);
-
- /* XXX this should handle the possible i/o error */
- VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- zvol_minors++;
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-/*
- * Remove minor node for the specified volume.
- */
-int
-zvol_remove_minor(const char *name)
-{
- zvol_state_t *zv;
- char namebuf[30];
-
- mutex_enter(&zvol_state_lock);
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
-
- if (zv->zv_total_opens != 0) {
- mutex_exit(&zvol_state_lock);
- return (EBUSY);
- }
-
- (void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
- ddi_remove_minor_node(zfs_dip, namebuf);
-
- (void) sprintf(namebuf, "%uc", zv->zv_minor);
- ddi_remove_minor_node(zfs_dip, namebuf);
-
- VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- zil_close(zv->zv_zilog);
- zv->zv_zilog = NULL;
- dmu_objset_close(zv->zv_objset);
- zv->zv_objset = NULL;
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
-
- ddi_soft_state_free(zvol_state, zv->zv_minor);
-
- zvol_minors--;
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-static int
-zvol_truncate(zvol_state_t *zv, uint64_t offset, uint64_t size)
-{
- dmu_tx_t *tx;
- int error;
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_free(tx, ZVOL_OBJ, offset, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
- error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, offset, size, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
-int
-zvol_prealloc(zvol_state_t *zv)
-{
- objset_t *os = zv->zv_objset;
- dmu_tx_t *tx;
- void *data;
- uint64_t refd, avail, usedobjs, availobjs;
- uint64_t resid = zv->zv_volsize;
- uint64_t off = 0;
-
- /* Check the space usage before attempting to allocate the space */
- dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
- if (avail < zv->zv_volsize)
- return (ENOSPC);
-
- /* Free old extents if they exist */
- zvol_free_extents(zv);
-
- /* allocate the blocks by writing each one */
- data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
-
- while (resid != 0) {
- int error;
- uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- kmem_free(data, SPA_MAXBLOCKSIZE);
- (void) zvol_truncate(zv, 0, off);
- return (error);
- }
- dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
- dmu_tx_commit(tx);
- off += bytes;
- resid -= bytes;
- }
- kmem_free(data, SPA_MAXBLOCKSIZE);
- txg_wait_synced(dmu_objset_pool(os), 0);
-
- return (0);
-}
-
-int
-zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
-{
- dmu_tx_t *tx;
- int error;
-
- ASSERT(MUTEX_HELD(&zvol_state_lock));
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
- &volsize, tx);
- dmu_tx_commit(tx);
-
- if (error == 0)
- error = zvol_truncate(zv, volsize, DMU_OBJECT_END);
-
- if (error == 0) {
- zv->zv_volsize = volsize;
- zvol_size_changed(zv, maj);
- }
- return (error);
-}
-
-int
-zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
-{
- zvol_state_t *zv;
- int error;
- dmu_object_info_t doi;
- uint64_t old_volsize = 0ULL;
-
- mutex_enter(&zvol_state_lock);
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
- old_volsize = zv->zv_volsize;
-
- if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
- (error = zvol_check_volsize(volsize,
- doi.doi_data_block_size)) != 0) {
- mutex_exit(&zvol_state_lock);
- return (error);
- }
-
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
- mutex_exit(&zvol_state_lock);
- return (EROFS);
- }
-
- error = zvol_update_volsize(zv, maj, volsize);
-
- /*
- * Reinitialize the dump area to the new size. If we
- * failed to resize the dump area then restore the it back to
- * it's original size.
- */
- if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
- if ((error = zvol_dumpify(zv)) != 0 ||
- (error = dumpvp_resize()) != 0) {
- (void) zvol_update_volsize(zv, maj, old_volsize);
- error = zvol_dumpify(zv);
- }
- }
-
- mutex_exit(&zvol_state_lock);
-
- return (error);
-}
-
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
-{
- zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
-
- mutex_enter(&zvol_state_lock);
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
- mutex_exit(&zvol_state_lock);
- return (EROFS);
- }
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
- volblocksize, 0, tx);
- if (error == ENOTSUP)
- error = EBUSY;
- dmu_tx_commit(tx);
- }
-
- mutex_exit(&zvol_state_lock);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
-{
- minor_t minor = getminor(*devp);
- zvol_state_t *zv;
-
- if (minor == 0) /* This is the control device */
- return (0);
-
- mutex_enter(&zvol_state_lock);
-
- zv = ddi_get_soft_state(zvol_state, minor);
- if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
-
- ASSERT(zv->zv_objset != NULL);
-
- if ((flag & FWRITE) &&
- (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) {
- mutex_exit(&zvol_state_lock);
- return (EROFS);
- }
-
- if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
- zv->zv_open_count[otyp]++;
- zv->zv_total_opens++;
- }
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-/*ARGSUSED*/
-int
-zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
-{
- minor_t minor = getminor(dev);
- zvol_state_t *zv;
-
- if (minor == 0) /* This is the control device */
- return (0);
-
- mutex_enter(&zvol_state_lock);
-
- zv = ddi_get_soft_state(zvol_state, minor);
- if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
-
- /*
- * The next statement is a workaround for the following DDI bug:
- * 6343604 specfs race: multiple "last-close" of the same device
- */
- if (zv->zv_total_opens == 0) {
- mutex_exit(&zvol_state_lock);
- return (0);
- }
-
- /*
- * If the open count is zero, this is a spurious close.
- * That indicates a bug in the kernel / DDI framework.
- */
- ASSERT(zv->zv_open_count[otyp] != 0);
- ASSERT(zv->zv_total_opens != 0);
-
- /*
- * You may get multiple opens, but only one close.
- */
- zv->zv_open_count[otyp]--;
- zv->zv_total_opens--;
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-static void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
-{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
-
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zvol_state_t *zv = arg;
- objset_t *os = zv->zv_objset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
- int error;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
-
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
-
- /*
- * Lock the range of the block to ensure that when the data is
- * written out and its checksum is being calculated that no other
- * thread can change the block.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
-
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0)
- zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
- zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
- return (error);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- */
-ssize_t zvol_immediate_write_sz = 32768;
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- lr_write_t *lr;
-
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
- lr = (lr_write_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = nbytes;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
- BP_ZERO(&lr->lr_blkptr);
-
- (void) zil_itx_assign(zv->zv_zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
- }
-}
-
-int
-zvol_dumpio(vdev_t *vd, uint64_t size, uint64_t offset, void *addr,
- int bflags, int isdump)
-{
- vdev_disk_t *dvd;
- int direction;
- int c;
- int numerrors = 0;
-
- for (c = 0; c < vd->vdev_children; c++) {
- if (zvol_dumpio(vd->vdev_child[c], size, offset,
- addr, bflags, isdump) != 0) {
- numerrors++;
- } else if (bflags & B_READ) {
- break;
- }
- }
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (numerrors < vd->vdev_children ? 0 : EIO);
-
- if (!vdev_writeable(vd))
- return (EIO);
-
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
- direction = bflags & (B_WRITE | B_READ);
- ASSERT(ISP2(direction));
- offset += VDEV_LABEL_START_SIZE;
-
- if (ddi_in_panic() || isdump) {
- if (direction & B_READ)
- return (EIO);
- return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
- lbtodb(size)));
- } else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- direction));
- }
-}
-
-int
-zvol_physio(zvol_state_t *zv, int bflags, uint64_t off,
- uint64_t size, void *addr, int isdump)
-{
- dva_t dva;
- vdev_t *vd;
- int error;
- spa_t *spa = dmu_objset_spa(zv->zv_objset);
-
- ASSERT(size <= zv->zv_volblocksize);
-
- /* restrict requests to multiples of the system block size */
- if (P2PHASE(off, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE))
- return (EINVAL);
-
- if (zvol_get_dva(zv, off, &dva) != 0)
- return (EIO);
-
- spa_config_enter(spa, RW_READER, FTAG);
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
-
- error = zvol_dumpio(vd, size,
- DVA_GET_OFFSET(&dva) + (off % zv->zv_volblocksize),
- addr, bflags & (B_READ | B_WRITE | B_PHYS), isdump);
-
- spa_config_exit(spa, FTAG);
- return (error);
-}
-
-int
-zvol_strategy(buf_t *bp)
-{
- zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
- uint64_t off, volsize;
- size_t size, resid;
- char *addr;
- objset_t *os;
- rl_t *rl;
- int error = 0;
- boolean_t reading, is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
-
- if (zv == NULL) {
- bioerror(bp, ENXIO);
- biodone(bp);
- return (0);
- }
-
- if (getminor(bp->b_edev) == 0) {
- bioerror(bp, EINVAL);
- biodone(bp);
- return (0);
- }
-
- if (!(bp->b_flags & B_READ) &&
- (zv->zv_flags & ZVOL_RDONLY ||
- zv->zv_mode & DS_MODE_READONLY)) {
- bioerror(bp, EROFS);
- biodone(bp);
- return (0);
- }
-
- off = ldbtob(bp->b_blkno);
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
- bp_mapin(bp);
- addr = bp->b_un.b_addr;
- resid = bp->b_bcount;
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- */
- reading = bp->b_flags & B_READ;
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
- reading ? RL_READER : RL_WRITER);
-
- if (resid > volsize - off) /* don't write past the end */
- resid = volsize - off;
-
- while (resid != 0 && off < volsize) {
-
- size = MIN(resid, zvol_maxphys);
- if (is_dump) {
- /* can't straddle a block boundary */
- size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
- error = zvol_physio(zv, bp->b_flags, off, size,
- addr, 0);
- } else if (reading) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
- dmu_tx_commit(tx);
- }
- }
- if (error)
- break;
- off += size;
- addr += size;
- resid -= size;
- }
- zfs_range_unlock(rl);
-
- if ((bp->b_resid = resid) == bp->b_bcount)
- bioerror(bp, off > volsize ? EINVAL : error);
-
- if (!(bp->b_flags & B_ASYNC) && !reading && !zil_disable && !is_dump)
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
- biodone(bp);
-
- return (0);
-}
-
-/*
- * Transfer-size limiter handed to physio() for zvols: a request larger
- * than zvol_maxphys is clamped to zvol_maxphys, anything else is left
- * untouched.
- *
- * Supplying this routine rather than relying on the default minphys()
- * means that larger writes go out in bigger buffers on X86 (128K rather
- * than 56K) and that the disk write cache is flushed less often: once
- * per zvol_maxphys (currently 1MB) rather than once per minphys
- * (currently 56K on X86 and 128K on sparc).
- */
-void
-zvol_minphys(struct buf *bp)
-{
-	/* Already within the limit: nothing to adjust. */
-	if (bp->b_bcount <= zvol_maxphys)
-		return;
-
-	bp->b_bcount = zvol_maxphys;
-}
-
-/*
- * Write nblocks disk blocks, taken from addr, to the zvol identified by
- * dev, starting at disk block blkno.  The data is pushed out through
- * zvol_physio() in pieces that never cross a volume block boundary.
- *
- * Returns ENXIO for the control device (minor 0) or an unknown minor,
- * EIO when the request does not lie entirely inside the volume, and
- * otherwise 0 or the first error reported by zvol_physio().
- */
-int
-zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
-{
-	minor_t minor = getminor(dev);
-	zvol_state_t *zv;
-	int error = 0;
-	uint64_t size;
-	uint64_t boff;
-	uint64_t resid;
-
-	if (minor == 0)			/* This is the control device */
-		return (ENXIO);
-
-	zv = ddi_get_soft_state(zvol_state, minor);
-	if (zv == NULL)
-		return (ENXIO);
-
-	/* From here on everything is expressed in bytes, not blocks. */
-	boff = ldbtob(blkno);
-	resid = ldbtob(nblocks);
-
-	/*
-	 * Range check written so that it cannot wrap: a negative blkno or
-	 * nblocks becomes a huge unsigned value and is rejected here
-	 * instead of slipping through an overflowing "boff + resid".
-	 */
-	if (boff > zv->zv_volsize || resid > zv->zv_volsize - boff) {
-		/*
-		 * dump should know better than to write here.  The
-		 * assertion is stated in bytes (the same units as the
-		 * check above) so that it reliably trips on DEBUG kernels.
-		 */
-		ASSERT(boff <= zv->zv_volsize &&
-		    resid <= zv->zv_volsize - boff);
-		return (EIO);
-	}
-
-	while (resid) {
-		/* can't straddle a block boundary */
-		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
-
-		error = zvol_physio(zv, B_WRITE, boff, size, addr, 1);
-		if (error)
-			break;
-		boff += size;
-		addr += size;
-		resid -= size;
-	}
-
-	return (error);
-}
-
-/*
- * Read from the zvol identified by dev into uio.
- *
- * The whole request is covered by a reader range lock and then copied
- * out in chunks of at most DMU_MAX_ACCESS >> 1 bytes until uio is
- * exhausted or dmu_read_uio() fails.
- *
- * Returns ENXIO for the control device (minor 0) or an unknown minor,
- * otherwise 0 or the error from dmu_read_uio().
- */
-/*ARGSUSED*/
-int
-zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
-{
-	zvol_state_t *zv;
-	rl_t *range;
-	int rc = 0;
-	minor_t unit = getminor(dev);
-
-	/* Minor 0 is the control device */
-	if (unit == 0)
-		return (ENXIO);
-
-	if ((zv = ddi_get_soft_state(zvol_state, unit)) == NULL)
-		return (ENXIO);
-
-	range = zfs_range_lock(&zv->zv_znode, uio->uio_loffset,
-	    uio->uio_resid, RL_READER);
-
-	/* Stop at the first failure or once nothing is left to transfer. */
-	while (rc == 0 && uio->uio_resid > 0) {
-		uint64_t chunk = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
-
-		rc = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, chunk);
-	}
-
-	zfs_range_unlock(range);
-	return (rc);
-}
-
-/*
- * Write the contents of uio to the zvol identified by dev.
- *
- * A dumpified zvol (ZVOL_DUMPIFIED set) is written through physio()
- * using zvol_strategy and zvol_minphys.  Otherwise the whole request is
- * covered by a writer range lock and copied in chunks of at most
- * DMU_MAX_ACCESS >> 1 bytes, each chunk in its own transaction and each
- * logged with zvol_log_write() once its copy has succeeded.
- *
- * Returns ENXIO for the control device (minor 0) or an unknown minor,
- * otherwise 0 or the first error encountered.
- */
-/*ARGSUSED*/
-int
-zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
-{
-	zvol_state_t *zv;
-	rl_t *range;
-	int rc = 0;
-	minor_t unit = getminor(dev);
-
-	/* Minor 0 is the control device */
-	if (unit == 0)
-		return (ENXIO);
-
-	if ((zv = ddi_get_soft_state(zvol_state, unit)) == NULL)
-		return (ENXIO);
-
-	if (zv->zv_flags & ZVOL_DUMPIFIED) {
-		return (physio(zvol_strategy, NULL, dev, B_WRITE,
-		    zvol_minphys, uio));
-	}
-
-	range = zfs_range_lock(&zv->zv_znode, uio->uio_loffset,
-	    uio->uio_resid, RL_WRITER);
-
-	while (rc == 0 && uio->uio_resid > 0) {
-		uint64_t len = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
-		uint64_t start = uio->uio_loffset;
-		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
-
-		dmu_tx_hold_write(tx, ZVOL_OBJ, start, len);
-		rc = dmu_tx_assign(tx, TXG_WAIT);
-		if (rc != 0) {
-			/* Never assigned: abort rather than commit. */
-			dmu_tx_abort(tx);
-			break;
-		}
-
-		rc = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, len, tx);
-		if (rc == 0)
-			zvol_log_write(zv, tx, start, len);
-
-		/* An assigned transaction is committed even on failure. */
-		dmu_tx_commit(tx);
-	}
-
-	zfs_range_unlock(range);
-	return (rc);
-}
-
-/*
- * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
- *
- * The zvol for dev is looked up with zvol_state_lock held; ENXIO is
- * returned when no soft state exists for the minor.  cmd is then
- * dispatched as follows:
- *
- *   DKIOCINFO, DKIOCGMEDIAINFO, DKIOCGETEFI
- *	build the reply from zvol state, drop zvol_state_lock, copy the
- *	reply out to arg and return directly (EFAULT on a failed copy).
- *   DKIOCFLUSHWRITECACHE
- *	zil_commit() the volume; optionally run the caller's callback.
- *   DKIOCGGEOM, DKIOCGVTOC
- *	ENOTSUP.
- *   DKIOCDUMPINIT, DKIOCDUMPFINI
- *	zvol_dumpify() / zvol_dump_fini() under a whole-volume writer
- *	range lock.
- *   anything else
- *	ENOTTY.
- *
- * Cases that "break" out of the switch release zvol_state_lock at the
- * bottom of the function; cases that "return" have released it already.
- */
-/*ARGSUSED*/
-int
-zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
-{
- zvol_state_t *zv;
- struct dk_cinfo dki;
- struct dk_minfo dkm;
- dk_efi_t efi;
- struct dk_callback *dkc;
- struct uuid uuid = EFI_RESERVED;
- uint32_t crc;
- int error = 0;
- rl_t *rl;
-
- mutex_enter(&zvol_state_lock);
-
- zv = ddi_get_soft_state(zvol_state, getminor(dev));
-
- if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
- return (ENXIO);
- }
-
- switch (cmd) {
-
- case DKIOCINFO: /* controller info: names "zvol", type DKC_UNKNOWN */
- bzero(&dki, sizeof (dki));
- (void) strcpy(dki.dki_cname, "zvol");
- (void) strcpy(dki.dki_dname, "zvol");
- dki.dki_ctype = DKC_UNKNOWN;
- dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
- mutex_exit(&zvol_state_lock); /* zv not needed past this point */
- if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
- error = EFAULT;
- return (error);
-
- case DKIOCGMEDIAINFO: /* block size and capacity in blocks */
- bzero(&dkm, sizeof (dkm));
- dkm.dki_lbsize = 1U << zv->zv_min_bs;
- dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
- dkm.dki_media_type = DK_UNKNOWN;
- mutex_exit(&zvol_state_lock); /* zv not needed past this point */
- if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
- error = EFAULT;
- return (error);
-
- case DKIOCGETEFI: /* synthesize an EFI label with one partition */
- if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
- mutex_exit(&zvol_state_lock);
- return (EFAULT);
- }
- efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
-
- /*
- * Some clients may attempt to request a PMBR for the
- * zvol. Currently this interface will return EINVAL to
- * such requests (any dki_lba other than 1 or 2 takes the
- * final else branch below). These requests could be
- * supported by adding a check for lba == 0 and consing up
- * an appropriate PMBR.
- */
- if (efi.dki_lba == 1) { /* LBA 1: the GPT header */
- efi_gpt_t gpt;
- efi_gpe_t gpe;
-
- bzero(&gpt, sizeof (gpt));
- bzero(&gpe, sizeof (gpe));
-
- if (efi.dki_length < sizeof (gpt)) {
- mutex_exit(&zvol_state_lock);
- return (EINVAL);
- }
-
- gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
- gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
- gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
- gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
- gpt.efi_gpt_LastUsableLBA =
- LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
- gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
- gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
- gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
-
- UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
- gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
- gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
-
- CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); /* entry array CRC */
- gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
-
- CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); /* HeaderCRC32 field is still zero here */
- gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
-
- mutex_exit(&zvol_state_lock);
- if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag))
- error = EFAULT;
- } else if (efi.dki_lba == 2) { /* LBA 2: the single partition entry */
- efi_gpe_t gpe;
-
- bzero(&gpe, sizeof (gpe));
-
- if (efi.dki_length < sizeof (gpe)) {
- mutex_exit(&zvol_state_lock);
- return (EINVAL);
- }
-
- UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
- gpe.efi_gpe_StartingLBA = LE_64(34ULL);
- gpe.efi_gpe_EndingLBA =
- LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
-
- mutex_exit(&zvol_state_lock);
- if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag))
- error = EFAULT;
- } else { /* every other LBA is rejected */
- mutex_exit(&zvol_state_lock);
- error = EINVAL;
- }
- return (error);
-
- case DKIOCFLUSHWRITECACHE:
- dkc = (struct dk_callback *)arg;
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
- if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { /* arg is dereferenced only when FKIOCTL is set */
- (*dkc->dkc_callback)(dkc->dkc_cookie, error); /* error is still 0 here */
- error = 0;
- }
- break;
-
- case DKIOCGGEOM:
- case DKIOCGVTOC:
- /*
- * commands using these (like prtvtoc) expect ENOTSUP
- * since we're emulating an EFI label
- */
- error = ENOTSUP;
- break;
-
- case DKIOCDUMPINIT: /* turn the zvol into a dump device */
- rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
- RL_WRITER);
- error = zvol_dumpify(zv);
- zfs_range_unlock(rl);
- break;
-
- case DKIOCDUMPFINI: /* undo DKIOCDUMPINIT */
- rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
- RL_WRITER);
- error = zvol_dump_fini(zv);
- zfs_range_unlock(rl);
- break;
-
- default:
- error = ENOTTY;
- break;
-
- }
- mutex_exit(&zvol_state_lock); /* common exit for every case that used break */
- return (error);
-}
-
-/*
- * Busy indicator for the zvol subsystem: returns 1 while zvol_minors is
- * nonzero and 0 once it has dropped to zero.
- */
-int
-zvol_busy(void)
-{
-	if (zvol_minors == 0)
-		return (0);
-
-	return (1);
-}
-
-void
-zvol_init(void)
-{ /*
- * One-time setup: create the zvol_state soft-state anchor, sized for
- * zvol_state_t entries, and initialize zvol_state_lock.  A failure of
- * ddi_soft_state_init() trips the VERIFY.
- */
- VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-zvol_fini(void)
-{ /*
- * Teardown counterpart of zvol_init(): destroy zvol_state_lock, then
- * release the zvol_state soft-state anchor.
- */
- mutex_destroy(&zvol_state_lock);
- ddi_soft_state_fini(&zvol_state);
-}
-
-/*
- * Report whether this zvol's device node is currently a swap device.
- *
- * The device path is built as ZVOL_FULL_DEV_DIR followed by the zvol
- * name, resolved with lookupname(), and the resulting vnode is tested
- * with IS_SWAPVP().  A failed lookup yields B_FALSE.
- */
-static boolean_t
-zvol_is_swap(zvol_state_t *zv)
-{
-	/*
-	 * Start out NULL: vp is tested (and released) below even when
-	 * lookupname() fails, and nothing visible here guarantees that a
-	 * failed lookup stores to it.
-	 */
-	vnode_t *vp = NULL;
-	boolean_t ret = B_FALSE;
-	char *devpath;
-	size_t devpathlen;
-	int error;
-
-	/* Exactly large enough for both strings plus the terminating NUL. */
-	devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
-	devpath = kmem_alloc(devpathlen, KM_SLEEP);
-	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
-	error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
-	kmem_free(devpath, devpathlen);
-
-	/* The && short-circuits, so vp is only examined after a success. */
-	ret = !error && IS_SWAPVP(common_specvp(vp));
-
-	if (vp != NULL)
-		VN_RELE(vp);
-
-	return (ret);
-}
-
-static int
-zvol_dump_init(zvol_state_t *zv, boolean_t resize)
-{ /*
- * Prepare zv for use as a dump device.  The caller must hold
- * zvol_state_lock (asserted below).
- *
- * resize == B_FALSE: first-time setup.  The current compression,
- * checksum and refreservation settings are saved in the zvol's ZAP
- * object, the volume is truncated, the three properties are set to
- * off/off/0, and the volume is preallocated.
- * resize == B_TRUE: only the saved refreservation entry is rewritten
- * (with the current volume size) before truncating and preallocating.
- *
- * Returns 0 on success or the first error encountered.
- */
- dmu_tx_t *tx;
- int error = 0;
- objset_t *os = zv->zv_objset;
- nvlist_t *nv = NULL;
- uint64_t checksum, compress, refresrv;
-
- ASSERT(MUTEX_HELD(&zvol_state_lock));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, ZVOL_OBJ, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- /*
- * If we are resizing the dump device then we only need to
- * update the refreservation to match the newly updated
- * zvolsize. Otherwise, we save off the original state of the
- * zvol so that we can restore them if the zvol is ever undumpified.
- *
- * In the save path each step runs only while error is still 0
- * ("error ? error : ..."), so the first failure is the one that
- * is eventually returned.
- */
- if (resize) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &zv->zv_volsize, tx);
- } else {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
-
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
- &compress, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &refresrv, tx);
- }
- dmu_tx_commit(tx); /* committed whether or not the updates succeeded */
-
- /* Truncate the file (skipped when an earlier step failed) */
- if (!error)
- error = zvol_truncate(zv, 0, DMU_OBJECT_END);
-
- if (error)
- return (error);
-
- /*
- * We only need update the zvol's property if we are initializing
- * the dump area for the first time.  The values applied are
- * refreservation 0, compression off and checksum off.
- */
- if (!resize) {
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- ZIO_COMPRESS_OFF) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
-
- error = zfs_set_prop_nvlist(zv->zv_name, nv);
- nvlist_free(nv);
-
- if (error)
- return (error);
- }
-
- /* Allocate the space for the dump */
- error = zvol_prealloc(zv);
- return (error);
-}
-
-static int
-zvol_dumpify(zvol_state_t *zv)
-{ /*
- * Convert zv into a dump device.
- *
- * Returns EROFS for a read-only zvol, ENOTSUP when the zvol is in use
- * as swap, otherwise 0 or the first error encountered.  Every failure
- * after those two checks rolls back through zvol_dump_fini(), whose
- * own return value is ignored.
- */
- int error = 0;
- uint64_t dumpsize = 0;
- dmu_tx_t *tx;
- objset_t *os = zv->zv_objset;
-
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))
- return (EROFS);
-
- /*
- * We do not support swap devices acting as dump devices.
- */
- if (zvol_is_swap(zv))
- return (ENOTSUP);
-
- if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
- 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { /* no recorded dump size, or it no longer matches the volume */
- boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; /* a nonzero recorded size means this is a resize; presumably dumpsize stays 0 when the lookup fails - confirm */
-
- if ((error = zvol_dump_init(zv, resize)) != 0) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
- }
-
- /*
- * Build up our lba mapping.
- */
- error = zvol_get_lbas(zv);
- if (error) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- zv->zv_flags |= ZVOL_DUMPIFIED; /* set before the ZAP update; zvol_dump_fini() clears it again on failure */
- error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
- &zv->zv_volsize, tx); /* record the size this dump device was set up for */
- dmu_tx_commit(tx);
-
- if (error) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- txg_wait_synced(dmu_objset_pool(os), 0);
- return (0);
-}
-
-/*
- * Undo zvol_dumpify(): remove the recorded dump size from the zvol's
- * ZAP object, free the extent list, clear ZVOL_DUMPIFIED and put back
- * whichever of the checksum, compression and refreservation settings
- * were saved in the ZAP object.
- *
- * Returns the dmu_tx_assign() error when the transaction cannot be
- * assigned; otherwise 0.  Everything after that is best effort and
- * failures there are deliberately ignored.
- */
-static int
-zvol_dump_fini(zvol_state_t *zv)
-{
-	dmu_tx_t *tx;
-	objset_t *os = zv->zv_objset;
-	nvlist_t *nv;
-	int error = 0;
-	int checksum_err, compress_err, refresrv_err;
-	uint64_t checksum = 0, compress = 0, refresrv = 0;
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		return (error);
-	}
-
-	/*
-	 * Attempt to restore the zvol back to its pre-dumpified state.
-	 * This is a best-effort attempt as it's possible that not all
-	 * of these properties were initialized during the dumpify process
-	 * (i.e. error during zvol_dump_init).  Remember which lookups
-	 * succeeded so that a value that was never saved is not "restored"
-	 * from an uninitialized variable.
-	 */
-	checksum_err = zap_lookup(os, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
-	compress_err = zap_lookup(os, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
-	refresrv_err = zap_lookup(os, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
-
-	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
-	zvol_free_extents(zv);
-	zv->zv_flags &= ~ZVOL_DUMPIFIED;
-	dmu_tx_commit(tx);
-
-	/* Nothing was saved, so there is nothing to put back. */
-	if (checksum_err != 0 && compress_err != 0 && refresrv_err != 0)
-		return (0);
-
-	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	if (checksum_err == 0) {
-		(void) nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
-	}
-	if (compress_err == 0) {
-		(void) nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
-	}
-	if (refresrv_err == 0) {
-		(void) nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
-	}
-	(void) zfs_set_prop_nvlist(zv->zv_name, nv);
-	nvlist_free(nv);
-
-	return (0);
-}