diff options
Diffstat (limited to 'zfs/lib/libdmu-ctl')
-rw-r--r-- | zfs/lib/libdmu-ctl/dctl_client.c | 263 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/dctl_common.c | 109 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/dctl_server.c | 476 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/dctl_thrpool.c | 253 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/dmu_send.c | 1249 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h | 71 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h | 144 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/rrwlock.c | 249 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_acl.c | 2641 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_ctldir.c | 1147 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_dir.c | 968 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_fuid.c | 688 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_ioctl.c | 3055 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_log.c | 693 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_replay.c | 876 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_rlock.c | 602 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_vfsops.c | 1671 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zfs_vnops.c | 4558 | ||||
-rw-r--r-- | zfs/lib/libdmu-ctl/zvol.c | 1830 |
19 files changed, 0 insertions, 21543 deletions
diff --git a/zfs/lib/libdmu-ctl/dctl_client.c b/zfs/lib/libdmu-ctl/dctl_client.c deleted file mode 100644 index e3d8f305b..000000000 --- a/zfs/lib/libdmu-ctl/dctl_client.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <ftw.h> -#include <errno.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/un.h> -#include <sys/debug.h> - -#include <sys/dmu_ctl.h> -#include <sys/dmu_ctl_impl.h> - -/* - * Try to connect to the socket given in path. - * - * For nftw() convenience, returns 0 if unsuccessful, otherwise - * returns the socket descriptor. - */ -static int try_connect(const char *path) -{ - struct sockaddr_un name; - int sock; - - sock = socket(PF_UNIX, SOCK_STREAM, 0); - if (sock == -1) { - perror("socket"); - return 0; - } - - /* - * The socket fd cannot be 0 otherwise nftw() will not interpret the - * return code correctly. 
- */ - VERIFY(sock != 0); - - name.sun_family = AF_UNIX; - strncpy(name.sun_path, path, sizeof(name.sun_path)); - - name.sun_path[sizeof(name.sun_path) - 1] = '\0'; - - if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) { - close(sock); - return 0; - } - - return sock; -} - -/* - * nftw() callback. - */ -static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag, - struct FTW *ftwbuf) -{ - if (!S_ISSOCK(sb->st_mode)) - return 0; - - if (strcmp(&fpath[ftwbuf->base], SOCKNAME) != 0) - return 0; - - return try_connect(fpath); -} - -/* - * For convenience, if check_subdirs is true we walk the directory tree to - * find a good socket. - */ -int dctlc_connect(const char *dir, boolean_t check_subdirs) -{ - char *fpath; - int fd; - - if (check_subdirs) - fd = nftw(dir, nftw_cb, 10, FTW_PHYS); - else { - fpath = malloc(strlen(dir) + strlen(SOCKNAME) + 2); - if (fpath == NULL) - return -1; - - strcpy(fpath, dir); - strcat(fpath, "/" SOCKNAME); - - fd = try_connect(fpath); - - free(fpath); - } - - return fd == 0 ? -1 : fd; -} - -void dctlc_disconnect(int fd) -{ - (void) shutdown(fd, SHUT_RDWR); -} - -static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd) -{ - return dctl_send_data(fd, (void *)(uintptr_t) cmd->u.dcmd_copy.ptr, - cmd->u.dcmd_copy.size); -} - -static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd) -{ - dctl_cmd_t reply; - char *from; - size_t len, buflen, to_copy; - int error; - - reply.dcmd_msg = DCTL_GEN_REPLY; - - from = (char *)(uintptr_t) cmd->u.dcmd_copy.ptr; - - buflen = cmd->u.dcmd_copy.size; - to_copy = strnlen(from, buflen - 1); - - reply.u.dcmd_reply.rc = from[to_copy] == '\0' ? 
0 : ENAMETOOLONG; - reply.u.dcmd_reply.size = to_copy; - - error = dctl_send_msg(fd, &reply); - - if (!error && to_copy > 0) - error = dctl_send_data(fd, from, to_copy); - - return error; -} - -static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd) -{ - return dctl_read_data(fd, (void *)(uintptr_t) cmd->u.dcmd_copy.ptr, - cmd->u.dcmd_copy.size); -} - -static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd) -{ - dctl_cmd_t reply; - void *buf; - int error; - ssize_t rrc, size = cmd->u.dcmd_fd_io.size; - - buf = malloc(size); - if (buf == NULL) - return ENOMEM; - - rrc = read(cmd->u.dcmd_fd_io.fd, buf, size); - - reply.dcmd_msg = DCTL_GEN_REPLY; - reply.u.dcmd_reply.rc = rrc == -1 ? errno : 0; - reply.u.dcmd_reply.size = rrc; - - error = dctl_send_msg(fd, &reply); - - if (!error && rrc > 0) - error = dctl_send_data(fd, buf, rrc); - -out: - free(buf); - - return error; -} - -static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd) -{ - dctl_cmd_t reply; - void *buf; - int error; - ssize_t wrc, size = cmd->u.dcmd_fd_io.size; - - buf = malloc(size); - if (buf == NULL) - return ENOMEM; - - error = dctl_read_data(fd, buf, size); - if (error) - goto out; - - wrc = write(cmd->u.dcmd_fd_io.fd, buf, size); - - reply.dcmd_msg = DCTL_GEN_REPLY; - reply.u.dcmd_reply.rc = wrc == -1 ? 
errno : 0; - reply.u.dcmd_reply.size = wrc; - - error = dctl_send_msg(fd, &reply); - -out: - free(buf); - - return error; -} - -int dctlc_ioctl(int fd, int32_t request, void *arg) -{ - int error; - dctl_cmd_t cmd; - - ASSERT(fd != 0); - - cmd.dcmd_msg = DCTL_IOCTL; - - cmd.u.dcmd_ioctl.cmd = request; - cmd.u.dcmd_ioctl.arg = (uintptr_t) arg; - - error = dctl_send_msg(fd, &cmd); - - while (!error && (error = dctl_read_msg(fd, &cmd)) == 0) { - switch (cmd.dcmd_msg) { - case DCTL_IOCTL_REPLY: - error = cmd.u.dcmd_reply.rc; - goto out; - case DCTL_COPYIN: - error = dctl_reply_copyin(fd, &cmd); - break; - case DCTL_COPYINSTR: - error = dctl_reply_copyinstr(fd, &cmd); - break; - case DCTL_COPYOUT: - error = dctl_reply_copyout(fd, &cmd); - break; - case DCTL_FD_READ: - error = dctl_reply_fd_read(fd, &cmd); - break; - case DCTL_FD_WRITE: - error = dctl_reply_fd_write(fd, &cmd); - break; - default: - fprintf(stderr, "%s(): invalid message " - "received.\n", __func__); - error = EINVAL; - goto out; - } - } - -out: - errno = error; - return error ? -1 : 0; -} diff --git a/zfs/lib/libdmu-ctl/dctl_common.c b/zfs/lib/libdmu-ctl/dctl_common.c deleted file mode 100644 index 8de37dcb1..000000000 --- a/zfs/lib/libdmu-ctl/dctl_common.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <stdio.h> -#include <errno.h> -#include <sys/types.h> -#include <sys/socket.h> - -#include <sys/dmu_ctl.h> -#include <sys/dmu_ctl_impl.h> - -int dctl_read_msg(int fd, dctl_cmd_t *cmd) -{ - int error; - - /* - * First, read only the magic number and the protocol version. - * - * This prevents blocking forever in case the size of dctl_cmd_t - * shrinks in future protocol versions. - */ - error = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE); - - if (!error &&cmd->dcmd_magic != DCTL_MAGIC) { - fprintf(stderr, "%s(): invalid magic number\n", __func__); - error = EIO; - } - - if (!error && cmd->dcmd_version != DCTL_PROTOCOL_VER) { - fprintf(stderr, "%s(): invalid protocol version\n", __func__); - error = ENOTSUP; - } - - if (error) - return error; - - /* Get the rest of the command */ - return dctl_read_data(fd, (caddr_t) cmd + DCTL_CMD_HEADER_SIZE, - sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE); -} - -int dctl_send_msg(int fd, dctl_cmd_t *cmd) -{ - cmd->dcmd_magic = DCTL_MAGIC; - cmd->dcmd_version = DCTL_PROTOCOL_VER; - - return dctl_send_data(fd, cmd, sizeof(dctl_cmd_t)); -} - -int dctl_read_data(int fd, void *ptr, size_t size) -{ - size_t read = 0; - size_t left = size; - ssize_t rc; - - while (left > 0) { - rc = recv(fd, (caddr_t) ptr + read, left, 0); - - /* File descriptor closed */ - if (rc == 0) - return ECONNRESET; - - if (rc == -1) { - if (errno == EINTR) - continue; - return errno; - } - - read += rc; - left -= rc; - } - - return 0; -} - -int dctl_send_data(int fd, const void *ptr, size_t size) -{ - ssize_t rc; - - do { - rc = send(fd, ptr, size, MSG_NOSIGNAL); - } while(rc == -1 && errno == EINTR); - - 
return rc == size ? 0 : EIO; -} - diff --git a/zfs/lib/libdmu-ctl/dctl_server.c b/zfs/lib/libdmu-ctl/dctl_server.c deleted file mode 100644 index 016278509..000000000 --- a/zfs/lib/libdmu-ctl/dctl_server.c +++ /dev/null @@ -1,476 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <stdio.h> -#include <stddef.h> -#include <stdlib.h> -#include <string.h> -#include <signal.h> -#include <limits.h> -#include <errno.h> -#include <poll.h> -#include <pthread.h> -#include <unistd.h> -#include <sys/debug.h> -#include <sys/socket.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/un.h> -#include <sys/list.h> -#include <sys/cred.h> - -#include <sys/dmu_ctl.h> -#include <sys/dmu_ctl_impl.h> - -static dctl_sock_info_t ctl_sock = { - .dsi_mtx = PTHREAD_MUTEX_INITIALIZER, - .dsi_fd = -1 -}; - -static int dctl_create_socket_common(); - -/* - * Routines from zfs_ioctl.c - */ -extern int zfs_ioctl_init(); -extern int zfs_ioctl_fini(); -extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, - int *rvalp); - -/* - * We can't simply put the client file descriptor in wthr_info_t because we - * have no way of accessing it from the DMU code without extensive - * modifications. - * - * Therefore each worker thread will have it's own global thread-specific - * client_fd variable. 
- */ -static __thread int client_fd = -1; - -int dctls_copyin(const void *src, void *dest, size_t size) -{ - dctl_cmd_t cmd; - - VERIFY(client_fd >= 0); - - cmd.dcmd_msg = DCTL_COPYIN; - cmd.u.dcmd_copy.ptr = (uintptr_t) src; - cmd.u.dcmd_copy.size = size; - - if (dctl_send_msg(client_fd, &cmd) != 0) - return EFAULT; - - if (dctl_read_data(client_fd, dest, size) != 0) - return EFAULT; - - return 0; -} - -int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len) -{ - dctl_cmd_t msg; - size_t copied; - - VERIFY(client_fd >= 0); - - if (max == 0) - return ENAMETOOLONG; - if (max < 0) - return EFAULT; - - msg.dcmd_msg = DCTL_COPYINSTR; - msg.u.dcmd_copy.ptr = (uintptr_t) from; - msg.u.dcmd_copy.size = max; - - if (dctl_send_msg(client_fd, &msg) != 0) - return EFAULT; - - if (dctl_read_msg(client_fd, &msg) != 0) - return EFAULT; - - if (msg.dcmd_msg != DCTL_GEN_REPLY) - return EFAULT; - - copied = msg.u.dcmd_reply.size; - - if (copied >= max) - return EFAULT; - - if (copied > 0) - if (dctl_read_data(client_fd, to, copied) != 0) - return EFAULT; - - to[copied] = '\0'; - - if (len != NULL) - *len = copied + 1; - - return msg.u.dcmd_reply.rc; -} - -int dctls_copyout(const void *src, void *dest, size_t size) -{ - dctl_cmd_t cmd; - - VERIFY(client_fd >= 0); - - cmd.dcmd_msg = DCTL_COPYOUT; - cmd.u.dcmd_copy.ptr = (uintptr_t) dest; - cmd.u.dcmd_copy.size = size; - - if (dctl_send_msg(client_fd, &cmd) != 0) - return EFAULT; - - if (dctl_send_data(client_fd, src, size) != 0) - return EFAULT; - - return 0; -} - -int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp) -{ - dctl_cmd_t msg; - uint64_t dsize; - int error; - - VERIFY(client_fd >= 0); - - msg.dcmd_msg = DCTL_FD_READ; - msg.u.dcmd_fd_io.fd = fd; - msg.u.dcmd_fd_io.size = len; - - if ((error = dctl_send_msg(client_fd, &msg)) != 0) - return error; - - if ((error = dctl_read_msg(client_fd, &msg)) != 0) - return error; - - if (msg.dcmd_msg != DCTL_GEN_REPLY) - return EIO; - - if 
(msg.u.dcmd_reply.rc != 0) - return msg.u.dcmd_reply.rc; - - dsize = msg.u.dcmd_reply.size; - - if (dsize > 0) - error = dctl_read_data(client_fd, buf, dsize); - - *residp = len - dsize; - - return error; -} - -int dctls_fd_write(int fd, const void *src, ssize_t len) -{ - dctl_cmd_t msg; - int error; - - VERIFY(client_fd >= 0); - - msg.dcmd_msg = DCTL_FD_WRITE; - msg.u.dcmd_fd_io.fd = fd; - msg.u.dcmd_fd_io.size = len; - - error = dctl_send_msg(client_fd, &msg); - - if (!error) - error = dctl_send_data(client_fd, src, len); - - if (!error) - error = dctl_read_msg(client_fd, &msg); - - if (error) - return error; - - if (msg.dcmd_msg != DCTL_GEN_REPLY) - return EIO; - - if (msg.u.dcmd_reply.rc != 0) - return msg.u.dcmd_reply.rc; - - /* - * We have to do this because the original upstream code - * does not check if residp == len. - */ - if (msg.u.dcmd_reply.size != len) - return EIO; - - return 0; -} - -/* Handle a new connection */ -static void dctl_handle_conn(int sock_fd) -{ - dctl_cmd_t cmd; - dev_t dev = { 0 }; - int rc; - - client_fd = sock_fd; - - while (dctl_read_msg(sock_fd, &cmd) == 0) { - if (cmd.dcmd_msg != DCTL_IOCTL) { - fprintf(stderr, "%s(): unexpected message type.\n", - __func__); - break; - } - - rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd, - (intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL); - - cmd.dcmd_msg = DCTL_IOCTL_REPLY; - cmd.u.dcmd_reply.rc = rc; - - if (dctl_send_msg(sock_fd, &cmd) != 0) - break; - } - close(sock_fd); - - client_fd = -1; -} - -/* Main worker thread loop */ -static void *dctl_thread(void *arg) -{ - wthr_info_t *thr = arg; - struct pollfd fds[1]; - - fds[0].events = POLLIN; - - pthread_mutex_lock(&ctl_sock.dsi_mtx); - - while (!thr->wthr_exit) { - /* Clean-up dead threads */ - dctl_thr_join(); - - /* The file descriptor might change in the thread lifetime */ - fds[0].fd = ctl_sock.dsi_fd; - - /* Poll socket with 1-second timeout */ - int rc = poll(fds, 1, 1000); - if (rc == 0 || (rc == -1 && errno == EINTR)) - continue; - - 
/* Recheck the exit flag */ - if (thr->wthr_exit) - break; - - if (rc == -1) { - /* Unknown error, let's try to recreate the socket */ - close(ctl_sock.dsi_fd); - ctl_sock.dsi_fd = -1; - - if (dctl_create_socket_common() != 0) - break; - - continue; - } - ASSERT(rc == 1); - - short rev = fds[0].revents; - if (rev == 0) - continue; - ASSERT(rev == POLLIN); - - /* - * At this point there should be a connection ready to be - * accepted. - */ - int client_fd = accept(ctl_sock.dsi_fd, NULL, NULL); - /* Many possible errors here, we'll just retry */ - if (client_fd == -1) - continue; - - /* - * Now lets handle the request. This can take a very - * long time (hours even), so we'll let other threads - * handle new connections. - */ - pthread_mutex_unlock(&ctl_sock.dsi_mtx); - - dctl_thr_rebalance(thr, B_FALSE); - dctl_handle_conn(client_fd); - dctl_thr_rebalance(thr, B_TRUE); - - pthread_mutex_lock(&ctl_sock.dsi_mtx); - } - pthread_mutex_unlock(&ctl_sock.dsi_mtx); - - dctl_thr_die(thr); - - return NULL; -} - -static int dctl_create_socket_common() -{ - dctl_sock_info_t *s = &ctl_sock; - size_t size; - int error; - - ASSERT(s->dsi_fd == -1); - - /* - * Unlink old socket, in case it exists. - * We don't care about errors here. 
- */ - unlink(s->dsi_path); - - /* Create the socket */ - s->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0); - if (s->dsi_fd == -1) { - error = errno; - perror("socket"); - return error; - } - - s->dsi_addr.sun_family = AF_UNIX; - - size = sizeof(s->dsi_addr.sun_path) - 1; - strncpy(s->dsi_addr.sun_path, s->dsi_path, size); - - s->dsi_addr.sun_path[size] = '\0'; - - if (bind(s->dsi_fd, (struct sockaddr *) &s->dsi_addr, - sizeof(s->dsi_addr)) != 0) { - error = errno; - perror("bind"); - return error; - } - - if (listen(s->dsi_fd, LISTEN_BACKLOG) != 0) { - error = errno; - perror("listen"); - unlink(s->dsi_path); - return error; - } - - return 0; -} - -static int dctl_create_socket(const char *cfg_dir) -{ - int error; - dctl_sock_info_t *s = &ctl_sock; - - ASSERT(s->dsi_path == NULL); - ASSERT(s->dsi_fd == -1); - - int pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2; - if (pathsize > sizeof(s->dsi_addr.sun_path)) - return ENAMETOOLONG; - - s->dsi_path = malloc(pathsize); - if (s->dsi_path == NULL) - return ENOMEM; - - strcpy(s->dsi_path, cfg_dir); - strcat(s->dsi_path, "/" SOCKNAME); - - /* - * For convenience, create the directory in case it doesn't exist. - * We don't care about errors here. - */ - mkdir(cfg_dir, 0770); - - error = dctl_create_socket_common(); - - if (error) { - free(s->dsi_path); - - if (s->dsi_fd != -1) { - close(s->dsi_fd); - s->dsi_fd = -1; - } - } - - return error; -} - -static void dctl_destroy_socket() -{ - dctl_sock_info_t *s = &ctl_sock; - - ASSERT(s->dsi_path != NULL); - ASSERT(s->dsi_fd != -1); - - close(s->dsi_fd); - s->dsi_fd = -1; - - unlink(s->dsi_path); - free(s->dsi_path); -} - -/* - * Initialize the DMU userspace control interface. - * This should be called after kernel_init(). - * - * Note that only very rarely we have more than a couple of simultaneous - * lzfs/lzpool connections. Since the thread pool grows automatically when all - * threads are busy, a good value for min_thr and max_free_thr is 2. 
- */ -int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr) -{ - int error; - - ASSERT(min_thr > 0); - ASSERT(max_free_thr >= min_thr); - - error = zfs_ioctl_init(); - if (error) - return error; - - error = dctl_create_socket(cfg_dir); - if (error) { - (void) zfs_ioctl_fini(); - return error; - } - - error = dctl_thr_pool_create(min_thr, max_free_thr, dctl_thread); - if (error) { - (void) zfs_ioctl_fini(); - dctl_destroy_socket(); - return error; - } - - return 0; -} - -/* - * Terminate control interface. - * This should be called after closing all objsets, but before calling - * kernel_fini(). - * May return EBUSY if the SPA is busy. - * - * Thread pool destruction can take a while due to poll() - * timeout or due to a thread being busy (e.g. a backup is being taken). - */ -int dctl_server_fini() -{ - dctl_thr_pool_stop(); - dctl_destroy_socket(); - - return zfs_ioctl_fini(); -} diff --git a/zfs/lib/libdmu-ctl/dctl_thrpool.c b/zfs/lib/libdmu-ctl/dctl_thrpool.c deleted file mode 100644 index 7b2f9b4c2..000000000 --- a/zfs/lib/libdmu-ctl/dctl_thrpool.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <stdlib.h> -#include <stddef.h> -#include <time.h> -#include <pthread.h> -#include <errno.h> -#include <sys/list.h> -#include <sys/debug.h> - -#include <sys/dmu_ctl.h> -#include <sys/dmu_ctl_impl.h> - -static dctl_thr_info_t thr_pool = { - .dti_mtx = PTHREAD_MUTEX_INITIALIZER -}; - -/* - * Create n threads. - * Callers must acquire thr_pool.dti_mtx first. - */ -static int dctl_thr_create(int n) -{ - dctl_thr_info_t *p = &thr_pool; - int error; - - for (int i = 0; i < n; i++) { - wthr_info_t *thr = malloc(sizeof(wthr_info_t)); - if (thr == NULL) - return ENOMEM; - - thr->wthr_exit = B_FALSE; - thr->wthr_free = B_TRUE; - - error = pthread_create(&thr->wthr_id, NULL, p->dti_thr_func, - thr); - if (error) { - free(thr); - return error; - } - - p->dti_free++; - - list_insert_tail(&p->dti_list, thr); - } - return 0; -} - -/* - * Mark the thread as dead. - * Must be called right before exiting the main thread function. - */ -void dctl_thr_die(wthr_info_t *thr) -{ - dctl_thr_info_t *p = &thr_pool; - - thr->wthr_exit = B_TRUE; - dctl_thr_rebalance(thr, B_FALSE); - - pthread_mutex_lock(&p->dti_mtx); - - list_remove(&p->dti_list, thr); - list_insert_tail(&p->dti_join_list, thr); - - pthread_mutex_unlock(&p->dti_mtx); -} - -/* - * Clean-up dead threads. - */ -void dctl_thr_join() -{ - dctl_thr_info_t *p = &thr_pool; - wthr_info_t *thr; - - pthread_mutex_lock(&p->dti_mtx); - - while ((thr = list_head(&p->dti_join_list))) { - list_remove(&p->dti_join_list, thr); - - ASSERT(!pthread_equal(thr->wthr_id, pthread_self())); - - /* - * This should not block because all the threads - * on this list should have died already. 
- * - * pthread_join() can only return an error if - * we made a programming mistake. - */ - VERIFY(pthread_join(thr->wthr_id, NULL) == 0); - - ASSERT(thr->wthr_exit); - ASSERT(!thr->wthr_free); - - free(thr); - } - - pthread_mutex_unlock(&p->dti_mtx); -} - -/* - * Adjust the number of free threads in the pool and the thread status. - * - * Callers must acquire thr_pool.dti_mtx first. - */ -static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free) -{ - dctl_thr_info_t *p = &thr_pool; - - ASSERT(p->dti_free >= 0); - - if (!thr->wthr_free && set_free) - p->dti_free++; - else if (thr->wthr_free && !set_free) - p->dti_free--; - - ASSERT(p->dti_free >= 0); - - thr->wthr_free = set_free; -} - -/* - * Rebalance threads. Also adjusts the free status of the thread. - * Will set the thread exit flag if the number of free threads is above - * the limit. - */ -void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free) -{ - dctl_thr_info_t *p = &thr_pool; - - pthread_mutex_lock(&p->dti_mtx); - - if (p->dti_exit || p->dti_free > p->dti_max_free) - thr->wthr_exit = B_TRUE; - - if (thr->wthr_exit) - set_free = B_FALSE; - - dctl_thr_adjust_free(thr, set_free); - - if (!p->dti_exit && p->dti_free == 0) - dctl_thr_create(1); - - pthread_mutex_unlock(&p->dti_mtx); -} - -/* - * Stop the thread pool. - * - * This can take a while since it actually waits for all threads to exit. 
- */ -void dctl_thr_pool_stop() -{ - dctl_thr_info_t *p = &thr_pool; - wthr_info_t *thr; - struct timespec ts; - - pthread_mutex_lock(&p->dti_mtx); - - ASSERT(!p->dti_exit); - p->dti_exit = B_TRUE; - - /* Let's flag the threads first */ - thr = list_head(&p->dti_list); - while (thr != NULL) { - thr->wthr_exit = B_TRUE; - dctl_thr_adjust_free(thr, B_FALSE); - - thr = list_next(&p->dti_list, thr); - } - - pthread_mutex_unlock(&p->dti_mtx); - - /* Now let's wait for them to exit */ - ts.tv_sec = 0; - ts.tv_nsec = 50000000; /* 50ms */ - do { - nanosleep(&ts, NULL); - - pthread_mutex_lock(&p->dti_mtx); - thr = list_head(&p->dti_list); - pthread_mutex_unlock(&p->dti_mtx); - - dctl_thr_join(); - } while(thr != NULL); - - ASSERT(p->dti_free == 0); - - ASSERT(list_is_empty(&p->dti_list)); - ASSERT(list_is_empty(&p->dti_join_list)); - - list_destroy(&p->dti_list); - list_destroy(&p->dti_join_list); -} - -/* - * Create thread pool. - * - * If at least one thread creation fails, it will stop all previous - * threads and return a non-zero value. 
- */ -int dctl_thr_pool_create(int min_thr, int max_free_thr, - thr_func_t *thr_func) -{ - int error; - dctl_thr_info_t *p = &thr_pool; - - ASSERT(p->dti_free == 0); - - /* Initialize global variables */ - p->dti_min = min_thr; - p->dti_max_free = max_free_thr; - p->dti_exit = B_FALSE; - p->dti_thr_func = thr_func; - - list_create(&p->dti_list, sizeof(wthr_info_t), offsetof(wthr_info_t, - wthr_node)); - list_create(&p->dti_join_list, sizeof(wthr_info_t), - offsetof(wthr_info_t, wthr_node)); - - pthread_mutex_lock(&p->dti_mtx); - error = dctl_thr_create(min_thr); - pthread_mutex_unlock(&p->dti_mtx); - - if (error) - dctl_thr_pool_stop(); - - return error; -} diff --git a/zfs/lib/libdmu-ctl/dmu_send.c b/zfs/lib/libdmu-ctl/dmu_send.c deleted file mode 100644 index 1c72f9507..000000000 --- a/zfs/lib/libdmu-ctl/dmu_send.c +++ /dev/null @@ -1,1249 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "@(#)dmu_send.c 1.14 08/04/27 SMI" - -#include <sys/dmu.h> -#include <sys/dmu_impl.h> -#include <sys/dmu_tx.h> -#include <sys/dbuf.h> -#include <sys/dnode.h> -#include <sys/zfs_context.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_traverse.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_pool.h> -#include <sys/dsl_synctask.h> -#include <sys/zfs_ioctl.h> -#include <sys/zap.h> -#include <sys/zio_checksum.h> - -static char *dmu_recv_tag = "dmu_recv_tag"; - -struct backuparg { - dmu_replay_record_t *drr; - vnode_t *vp; - offset_t *off; - objset_t *os; - zio_cksum_t zc; - int err; -}; - -static int -dump_bytes(struct backuparg *ba, void *buf, int len) -{ - ssize_t resid; /* have to get resid to get detailed errno */ - ASSERT3U(len % 8, ==, 0); - - fletcher_4_incremental_native(buf, len, &ba->zc); - ba->err = vn_rdwr(UIO_WRITE, ba->vp, - (caddr_t)buf, len, - 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - *ba->off += len; - return (ba->err); -} - -static int -dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, - uint64_t length) -{ - /* write a FREE record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREE; - ba->drr->drr_u.drr_free.drr_object = object; - ba->drr->drr_u.drr_free.drr_offset = offset; - ba->drr->drr_u.drr_free.drr_length = length; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - return (0); -} - -static int -dump_data(struct backuparg *ba, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, void *data) -{ - /* write a DATA record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_WRITE; - ba->drr->drr_u.drr_write.drr_object = object; - ba->drr->drr_u.drr_write.drr_type = type; - ba->drr->drr_u.drr_write.drr_offset = offset; - ba->drr->drr_u.drr_write.drr_length = blksz; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - if 
(dump_bytes(ba, data, blksz)) - return (EINTR); - return (0); -} - -static int -dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) -{ - /* write a FREEOBJECTS record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREEOBJECTS; - ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; - ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - return (0); -} - -static int -dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) -{ - if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(ba, object, 1)); - - /* write an OBJECT record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_OBJECT; - ba->drr->drr_u.drr_object.drr_object = object; - ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; - ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; - ba->drr->drr_u.drr_object.drr_blksz = - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; - ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; - ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); - - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) - return (EINTR); - - /* free anything past the end of the file */ - if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) - return (EINTR); - if (ba->err) - return (EINTR); - return (0); -} - -#define BP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) - -static int -backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - struct backuparg *ba = arg; - uint64_t object = bc->bc_bookmark.zb_object; - int level = bc->bc_bookmark.zb_level; - uint64_t blkid = 
bc->bc_bookmark.zb_blkid; - blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; - dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; - void *data = bc->bc_data; - int err = 0; - - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (EINTR); - - ASSERT(data || bp == NULL); - - if (bp == NULL && object == 0) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); - } else if (bp == NULL) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - err = dump_free(ba, object, blkid * span, span); - } else if (data && level == 0 && type == DMU_OT_DNODE) { - dnode_phys_t *blk = data; - int i; - int blksz = BP_GET_LSIZE(bp); - - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = - (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(ba, dnobj, blk+i); - if (err) - break; - } - } else if (level == 0 && - type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { - int blksz = BP_GET_LSIZE(bp); - if (data == NULL) { - uint32_t aflags = ARC_WAIT; - arc_buf_t *abuf; - zbookmark_t zb; - - zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; - zb.zb_object = object; - zb.zb_level = level; - zb.zb_blkid = blkid; - (void) arc_read(NULL, spa, bp, - dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); - - if (abuf) { - err = dump_data(ba, type, object, blkid * blksz, - blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); - } - } else { - err = dump_data(ba, type, object, blkid * blksz, - blksz, data); - } - } - - ASSERT(err == 0 || err == EINTR); - return (err); -} - -int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - vnode_t *vp, offset_t *off) -{ - dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os->os_dsl_dataset : NULL; - dmu_replay_record_t *drr; - struct backuparg ba; - int err; - uint64_t fromtxg = 0; - - /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) - return (EINVAL); - - /* fromsnap must be an earlier snapshot from the same fs as tosnap */ - if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) - return (EXDEV); - - if (fromorigin) { - if (fromsnap) - return (EINVAL); - - if (ds->ds_dir->dd_phys->dd_origin_obj != NULL) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_open_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, NULL, - DS_MODE_NONE, FTAG, &fromds); - rw_exit(&dp->dp_config_rwlock); - if (err) - return (err); - } else { - fromorigin = B_FALSE; - } - } - - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; - drr->drr_u.drr_begin.drr_creation_time = - ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; - if (fromorigin) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - - if (fromds) - drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - - if (fromds) - fromtxg = fromds->ds_phys->ds_creation_txg; - if (fromorigin) - dsl_dataset_close(fromds, DS_MODE_NONE, FTAG); - - ba.drr = drr; - ba.vp = vp; - ba.os = tosnap; - ba.off = off; - ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); - } - - err = traverse_dsl_dataset(ds, fromtxg, - ADVANCE_PRE | 
ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, - backup_cb, &ba); - - if (err) { - if (err == EINTR && ba.err) - err = ba.err; - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (err); - } - - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = ba.zc; - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); - } - - kmem_free(drr, sizeof (dmu_replay_record_t)); - - return (0); -} - -struct recvbeginsyncarg { - const char *tofs; - const char *tosnap; - dsl_dataset_t *origin; - uint64_t fromguid; - dmu_objset_type_t type; - void *tag; - boolean_t force; - uint64_t dsflags; - char clonelastname[MAXNAMELEN]; - dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ -}; - -static dsl_dataset_t * -recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, - cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - - VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL, - DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds)); - - if (type != DMU_OST_NONE) { - (void) dmu_objset_create_impl(dp->dp_spa, - ds, &ds->ds_phys->ds_bp, type, tx); - } - - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_phys->ds_dir_obj); - - return (ds); -} - -/* ARGSUSED */ -static int -recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; - int err; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - - if (err != ENOENT) - return (err ? 
err : EEXIST); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - } - - return (0); -} - -static void -recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - uint64_t dsobj; - uint64_t flags = DS_FLAG_INCONSISTENT; - - flags |= rbsa->dsflags; - - dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, cr, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); -} - -static int -recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; - - /* must be a head ds */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* must not be a clone ds */ - if (ds->ds_prev != NULL) - return (EINVAL); - - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); - if (err) - return (err); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) - return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - } - - return (0); -} - -static void -recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_dir_t *dd = ds->ds_dir; - uint64_t dsobj; - uint64_t flags = DS_FLAG_INCONSISTENT; - - flags |= rbsa->dsflags; - - /* - * NB: caller must provide an extra hold on the dsl_dir_t, so it - * won't go away when dsl_dataset_destroy_sync() closes the - * dataset. 
- */ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); - - dsobj = dsl_dataset_create_sync_impl(dd, rbsa->origin, flags, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); -} - -/* ARGSUSED */ -static int -recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; - uint64_t val; - - /* must not have any changes since most recent snapshot */ - if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) - return (ETXTBSY); - - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); - - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - - /* temporary clone name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, - rbsa->clonelastname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - /* new snapshot name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - return (0); -} - -/* ARGSUSED */ -static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ohds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *ods, *cds; - uint64_t dsobj; - uint64_t flags = DS_FLAG_INCONSISTENT; - - flags |= rbsa->dsflags; - - /* create the temporary clone */ - VERIFY(0 == dsl_dataset_open_obj(dp, ohds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_STANDARD, FTAG, &ods)); - dsobj = dsl_dataset_create_sync(ohds->ds_dir, - rbsa->clonelastname, ods, flags, cr, tx); - dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG); - - /* open the temporary clone */ - VERIFY(0 == 
dsl_dataset_open_obj(dp, dsobj, NULL, - DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds)); - - /* copy the refquota from the target fs to the clone */ - if (ohds->ds_quota > 0) - dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); - - rbsa->ds = cds; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", - cds->ds_phys->ds_dir_obj); -} - -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_phys->ds_dir_obj); -} - -/* - * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() - * succeeds; otherwise we will leak the holds on the datasets. - */ -int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) -{ - int err = 0; - boolean_t byteswap; - struct recvbeginsyncarg rbsa; - uint64_t version; - int flags; - dsl_dataset_t *ds; - - if (drrb->drr_magic == DMU_BACKUP_MAGIC) - byteswap = FALSE; - else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - byteswap = TRUE; - else - return (EINVAL); - - rbsa.tofs = tofs; - rbsa.tosnap = tosnap; - rbsa.origin = origin ? 
origin->os->os_dsl_dataset : NULL; - rbsa.fromguid = drrb->drr_fromguid; - rbsa.type = drrb->drr_type; - rbsa.tag = FTAG; - rbsa.dsflags = 0; - version = drrb->drr_version; - flags = drrb->drr_flags; - - if (byteswap) { - rbsa.type = BSWAP_32(rbsa.type); - rbsa.fromguid = BSWAP_64(rbsa.fromguid); - version = BSWAP_64(version); - flags = BSWAP_32(flags); - } - - if (version != DMU_BACKUP_STREAM_VERSION || - rbsa.type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && origin == NULL)) - return (EINVAL); - - if (flags & DRR_FLAG_CI_DATA) - rbsa.dsflags = DS_FLAG_CI_DATASET; - - bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; - drc->drc_tosnap = tosnap; - drc->drc_force = force; - - /* - * Process the begin in syncing context. - */ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_open(tofs, - DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds); - if (err) - return (err); - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, - dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, - ds, &rbsa, 1); - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ - - /* tmp clone name is: tofs/%tosnap" */ - (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), - "%%%s", tosnap); - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_open(tofs, - DS_MODE_STANDARD, dmu_recv_tag, &ds); - if (err) - return (err); - - rbsa.force = force; - err = 
dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); - if (err) { - dsl_dataset_close(ds, DS_MODE_STANDARD, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = ds; - drc->drc_real_ds = rbsa.ds; - } else { - /* create new fs -- full backup or clone */ - dsl_dir_t *dd = NULL; - const char *tail; - - err = dsl_dir_open(tofs, FTAG, &dd, &tail); - if (err) - return (err); - if (tail == NULL) { - if (!force) { - dsl_dir_close(dd, FTAG); - return (EEXIST); - } - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, - FTAG, &ds); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, FTAG); - return (err); - } - - err = dsl_sync_task_do(dd->dd_pool, - recv_full_existing_check, - recv_full_existing_sync, ds, &rbsa, 5); - /* if successful, sync task closes the ds for us */ - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - err = dsl_sync_task_do(dd->dd_pool, recv_full_check, - recv_full_sync, dd, &rbsa, 5); - if (err) - return (err); - } - dsl_dir_close(dd, FTAG); - if (err) - return (err); - drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; - drc->drc_newfs = B_TRUE; - } - - /* downgrade our hold on the ds from EXCLUSIVE to PRIMARY */ - dsl_dataset_downgrade(drc->drc_real_ds, - DS_MODE_EXCLUSIVE, DS_MODE_PRIMARY); - - return (0); -} - -struct restorearg { - int err; - int byteswap; - vnode_t *vp; - char *buf; - uint64_t voff; - int bufsize; /* amount of memory allocated for buf */ - zio_cksum_t cksum; -}; - -static void * -restore_read(struct restorearg *ra, int len) -{ - void *rv; - int done = 0; - - /* some things will require 8-byte alignment, so everything must */ - ASSERT3U(len % 8, ==, 0); - - while (done < len) { - ssize_t resid; - - ra->err = vn_rdwr(UIO_READ, ra->vp, - (caddr_t)ra->buf + done, len - done, - ra->voff, UIO_SYSSPACE, 
FAPPEND, - RLIM64_INFINITY, CRED(), &resid); - - if (resid == len - done) - ra->err = EINVAL; - ra->voff += len - done - resid; - done = len - resid; - if (ra->err) - return (NULL); - } - - ASSERT3U(done, ==, len); - rv = ra->buf; - if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->cksum); - else - fletcher_4_incremental_native(rv, len, &ra->cksum); - return (rv); -} - -static void -backup_byteswap(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_version); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO32(drr_begin.drr_flags); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - /* DO64(drr_object.drr_allocation_txg); */ - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_length); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - break; - case DRR_END: - DO64(drr_end.drr_checksum.zc_word[0]); - DO64(drr_end.drr_checksum.zc_word[1]); - DO64(drr_end.drr_checksum.zc_word[2]); - DO64(drr_end.drr_checksum.zc_word[3]); - break; - } -#undef DO64 -#undef DO32 -} - -static int -restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) -{ - int err; - dmu_tx_t *tx; - - err = dmu_object_info(os, drro->drr_object, NULL); - - if (err != 0 && err != ENOENT) - return (EINVAL); - - if 
(drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || - drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || - drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || - P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || - drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > SPA_MAXBLOCKSIZE || - drro->drr_bonuslen > DN_MAX_BONUSLEN) { - return (EINVAL); - } - - tx = dmu_tx_create(os); - - if (err == ENOENT) { - /* currently free, want to be allocated */ - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_claim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); - } else { - /* currently allocated, want to be allocated */ - dmu_tx_hold_bonus(tx, drro->drr_object); - /* - * We may change blocksize, so need to - * hold_write - */ - dmu_tx_hold_write(tx, drro->drr_object, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - err = dmu_object_reclaim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); - } - if (err) { - dmu_tx_commit(tx); - return (EINVAL); - } - - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); - dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); - - if (drro->drr_bonuslen) { - dmu_buf_t *db; - void *data; - VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - - ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); - if (data == NULL) { - dmu_tx_commit(tx); - return (ra->err); - } - bcopy(data, db->db_data, drro->drr_bonuslen); - if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, - drro->drr_bonuslen); - } - dmu_buf_rele(db, FTAG); - } - 
dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -static int -restore_freeobjects(struct restorearg *ra, objset_t *os, - struct drr_freeobjects *drrfo) -{ - uint64_t obj; - - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (EINVAL); - - for (obj = drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; - int err; - - if (dmu_object_info(os, obj, NULL) != 0) - continue; - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); - } - return (0); -} - -static int -restore_write(struct restorearg *ra, objset_t *os, - struct drr_write *drrw) -{ - dmu_tx_t *tx; - void *data; - int err; - - if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) - return (EINVAL); - - data = restore_read(ra, drrw->drr_length); - if (data == NULL) - return (ra->err); - - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) - return (EINVAL); - - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); - dmu_write(os, drrw->drr_object, - drrw->drr_offset, drrw->drr_length, data, tx); - dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -static int -restore_free(struct restorearg *ra, objset_t *os, - struct drr_free *drrf) -{ - dmu_tx_t *tx; - int err; - - if (drrf->drr_length != -1ULL && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (EINVAL); - - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) - return (EINVAL); - - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, - 
drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); - return (err); -} - -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != drc->drc_logical_ds) { - dsl_dataset_close(drc->drc_logical_ds, - DS_MODE_STANDARD, dmu_recv_tag); - } - } else { - /* - * offline incremental: rollback to most recent snapshot. - */ - int lmode = DS_MODE_PRIMARY; - if (dsl_dataset_tryupgrade(drc->drc_real_ds, - DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) { - lmode = DS_MODE_EXCLUSIVE; - (void) dsl_dataset_rollback(drc->drc_real_ds, - DMU_OST_NONE); - } - dsl_dataset_close(drc->drc_real_ds, lmode, FTAG); - } -} - -/* - * NB: callers *must* call dmu_recv_end() if this succeeds. 
- */ -int -dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) -{ - struct restorearg ra = { 0 }; - dmu_replay_record_t *drr; - objset_t *os; - zio_cksum_t pcksum; - - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - ra.byteswap = TRUE; - - { - /* compute checksum of drr_begin record */ - dmu_replay_record_t *drr; - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } else { - fletcher_4_incremental_native(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - } - - if (ra.byteswap) { - struct drr_begin *drrb = drc->drc_drrb; - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - - ra.vp = vp; - ra.voff = *voffp; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - - /* these were verified in dmu_recv_begin */ - ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); - ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); - - /* - * Open the objset we are modifying. - */ - VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); - - ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); - - /* - * Read records and process them. 
- */ - pcksum = ra.cksum; - while (ra.err == 0 && - NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - ra.err = EINTR; - goto out; - } - - if (ra.byteswap) - backup_byteswap(drr); - - switch (drr->drr_type) { - case DRR_OBJECT: - { - /* - * We need to make a copy of the record header, - * because restore_{object,write} may need to - * restore_read(), which will invalidate drr. - */ - struct drr_object drro = drr->drr_u.drr_object; - ra.err = restore_object(&ra, os, &drro); - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects drrfo = - drr->drr_u.drr_freeobjects; - ra.err = restore_freeobjects(&ra, os, &drrfo); - break; - } - case DRR_WRITE: - { - struct drr_write drrw = drr->drr_u.drr_write; - ra.err = restore_write(&ra, os, &drrw); - break; - } - case DRR_FREE: - { - struct drr_free drrf = drr->drr_u.drr_free; - ra.err = restore_free(&ra, os, &drrf); - break; - } - case DRR_END: - { - struct drr_end drre = drr->drr_u.drr_end; - /* - * We compare against the *previous* checksum - * value, because the stored checksum is of - * everything before the DRR_END record. - */ - if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) - ra.err = ECKSUM; - goto out; - } - default: - ra.err = EINVAL; - goto out; - } - pcksum = ra.cksum; - } - ASSERT(ra.err != 0); - -out: - dmu_objset_close(os); - - if (ra.err != 0) { - /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. 
- */ - txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); - } - - kmem_free(ra.buf, ra.bufsize); - *voffp = ra.voff; - return (ra.err); -} - -struct recvendsyncarg { - char *tosnap; - uint64_t creation_time; - uint64_t toguid; -}; - -static int -recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; - - return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); -} - -static void -recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; - - dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; - ds->ds_prev->ds_phys->ds_guid = resa->toguid; - ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -} - -int -dmu_recv_end(dmu_recv_cookie_t *drc) -{ - int err = 0; - int lmode; - - /* - * XXX hack; seems the ds is still dirty and - * dsl_pool_zil_clean() expects it to have a ds_user_ptr (and - * zil), but clone_swap() can close it. 
- */ - txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - - if (dsl_dataset_tryupgrade(drc->drc_real_ds, - DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) { - lmode = DS_MODE_EXCLUSIVE; - } else { - dmu_recv_abort_cleanup(drc); - return (EBUSY); - } - - if (drc->drc_logical_ds != drc->drc_real_ds) { - if (err == 0 && dsl_dataset_tryupgrade(drc->drc_logical_ds, - DS_MODE_STANDARD, DS_MODE_EXCLUSIVE)) { - lmode = DS_MODE_EXCLUSIVE; - err = dsl_dataset_clone_swap(drc->drc_real_ds, - drc->drc_logical_ds, drc->drc_force); - } else { - lmode = DS_MODE_STANDARD; - err = EBUSY; - } - } - - if (err == 0) { - struct recvendsyncarg resa; - - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; - - err = dsl_sync_task_do(drc->drc_real_ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, - drc->drc_logical_ds, &resa, 3); - if (err) { - if (drc->drc_newfs) { - ASSERT(drc->drc_logical_ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(drc->drc_real_ds, - dmu_recv_tag); - return (err); - } else { - (void) dsl_dataset_rollback(drc->drc_logical_ds, - DMU_OST_NONE); - } - } - } - - if (drc->drc_logical_ds != drc->drc_real_ds) { - /* dsl_dataset_destroy() will close the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - } - /* close the hold from dmu_recv_begin */ - dsl_dataset_close(drc->drc_logical_ds, lmode, dmu_recv_tag); - return (err); -} diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h deleted file mode 100644 index c2044ba27..000000000 --- a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_DMU_CTL_H -#define _SYS_DMU_CTL_H - -#include <sys/types.h> - -/* Default directory where the clients search for sockets to connect */ -#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu" - -/* - * These functions are called by the server process. - * - * kernel_init() must be called before dctl_server_init(). - * kernel_fini() must not be called before dctl_server_fini(). - * - * All objsets must be closed and object references be released before calling - * dctl_server_fini(), otherwise it will return EBUSY. - * - * Note: On Solaris, it is highly recommended to either catch or ignore the - * SIGPIPE signal, otherwise the server process will die if the client is - * killed. - */ -int dctl_server_init(const char *cfg_dir, int min_threads, - int max_free_threads); -int dctl_server_fini(); - -/* - * The following functions are called by the DMU from the server process context - * (in the worker threads). 
- */ -int dctls_copyin(const void *src, void *dest, size_t size); -int dctls_copyinstr(const char *from, char *to, size_t max, - size_t *len); -int dctls_copyout(const void *src, void *dest, size_t size); -int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp); -int dctls_fd_write(int fd, const void *src, ssize_t len); - -/* - * These functions are called by the client process (libzfs). - */ -int dctlc_connect(const char *dir, boolean_t check_subdirs); -void dctlc_disconnect(int fd); - -int dctlc_ioctl(int fd, int32_t request, void *arg); - -#endif diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h deleted file mode 100644 index 6b4a564b3..000000000 --- a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DMU_CTL_IMPL_H -#define _SYS_DMU_CTL_IMPL_H - -#include <sys/list.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/un.h> -#include <pthread.h> - -#define SOCKNAME "dmu_socket" - -#define DCTL_PROTOCOL_VER 1 -#define DCTL_MAGIC 0xdc71b1070c01dc71ll - -/* Message types */ -enum { - DCTL_IOCTL, - DCTL_IOCTL_REPLY, - DCTL_COPYIN, - DCTL_COPYINSTR, - DCTL_COPYOUT, - DCTL_FD_READ, - DCTL_FD_WRITE, - DCTL_GEN_REPLY /* generic reply */ -}; - -/* On-the-wire message */ -typedef struct dctl_cmd { - uint64_t dcmd_magic; - int8_t dcmd_version; - int8_t dcmd_msg; - uint8_t dcmd_pad[6]; - union { - struct dcmd_ioctl { - uint64_t arg; - int32_t cmd; - uint8_t pad[4]; - } dcmd_ioctl; - - struct dcmd_copy_req { - uint64_t ptr; - uint64_t size; - } dcmd_copy; - - struct dcmd_fd_req { - int64_t size; - int32_t fd; - uint8_t pad[4]; - } dcmd_fd_io; - - struct dcmd_reply { - uint64_t size; /* used by reply to DCTL_COPYINSTR, - DCTL_FD_READ and DCTL_FD_WRITE */ - int32_t rc; /* return code */ - uint8_t pad[4]; - } dcmd_reply; - } u; -} dctl_cmd_t; - -#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t)) - -/* - * The following definitions are only used by the server code. 
- */ - -#define LISTEN_BACKLOG 5 - -/* Worker thread data */ -typedef struct wthr_info { - list_node_t wthr_node; - pthread_t wthr_id; - boolean_t wthr_exit; /* termination flag */ - boolean_t wthr_free; -} wthr_info_t; - -/* Control socket data */ -typedef struct dctl_sock_info { - pthread_mutex_t dsi_mtx; - char *dsi_path; - struct sockaddr_un dsi_addr; - int dsi_fd; -} dctl_sock_info_t; - -typedef void *thr_func_t(void *); - -/* Thread pool data */ -typedef struct dctl_thr_info { - thr_func_t *dti_thr_func; - - pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */ - list_t dti_list; /* list of threads in the thread pool */ - list_t dti_join_list; /* list of threads that are waiting to be - joined */ - int dti_free; /* number of free worker threads */ - - int dti_min; - int dti_max_free; - - boolean_t dti_exit; /* global termination flag */ -} dctl_thr_info_t; - -/* Messaging functions functions */ -int dctl_read_msg(int fd, dctl_cmd_t *cmd); -int dctl_send_msg(int fd, dctl_cmd_t *cmd); - -int dctl_read_data(int fd, void *ptr, size_t size); -int dctl_send_data(int fd, const void *ptr, size_t size); - -/* Thread pool functions */ -int dctl_thr_pool_create(int min_thr, int max_free_thr, - thr_func_t *thr_func); -void dctl_thr_pool_stop(); - -void dctl_thr_join(); -void dctl_thr_die(wthr_info_t *thr); -void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free); - -#endif diff --git a/zfs/lib/libdmu-ctl/rrwlock.c b/zfs/lib/libdmu-ctl/rrwlock.c deleted file mode 100644 index c46ed8155..000000000 --- a/zfs/lib/libdmu-ctl/rrwlock.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)rrwlock.c 1.1 07/10/24 SMI" - -#include <sys/refcount.h> -#include <sys/rrwlock.h> - -/* - * This file contains the implementation of a re-entrant read - * reader/writer lock (aka "rrwlock"). - * - * This is a normal reader/writer lock with the additional feature - * of allowing threads who have already obtained a read lock to - * re-enter another read lock (re-entrant read) - even if there are - * waiting writers. - * - * Callers who have not obtained a read lock give waiting writers priority. - * - * The rrwlock_t lock does not allow re-entrant writers, nor does it - * allow a re-entrant mix of reads and writes (that is, it does not - * allow a caller who has already obtained a read lock to be able to - * then grab a write lock without first dropping all read locks, and - * vice versa). - * - * The rrwlock_t uses tsd (thread specific data) to keep a list of - * nodes (rrw_node_t), where each node keeps track of which specific - * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering - * should be rare, a thread that grabs multiple reads on the same rrwlock_t - * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the - * tsd list can represent a different rrwlock_t. This allows a thread - * to enter multiple and unique rrwlock_ts for read locks at the same time. 
- * - * Since using tsd exposes some overhead, the rrwlock_t only needs to - * keep tsd data when writers are waiting. If no writers are waiting, then - * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd - * is needed. Once a writer attempts to grab the lock, readers then - * keep tsd data and bump the linked readers count (rr_linked_rcount). - * - * If there are waiting writers and there are anonymous readers, then a - * reader doesn't know if it is a re-entrant lock. But since it may be one, - * we allow the read to proceed (otherwise it could deadlock). Since once - * waiting writers are active, readers no longer bump the anonymous count, - * the anonymous readers will eventually flush themselves out. At this point, - * readers will be able to tell if they are a re-entrant lock (have a - * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then - * we must let the proceed. If they are not, then the reader blocks for the - * waiting writers. Hence, we do not starve writers. - */ - -/* global key for TSD */ -uint_t rrw_tsd_key; - -typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; -} rrw_node_t; - -static rrw_node_t * -rrn_find(rrwlock_t *rrl) -{ - rrw_node_t *rn; - - if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) - return (rn); - } - return (NULL); -} - -/* - * Add a node to the head of the singly linked list. - */ -static void -rrn_add(rrwlock_t *rrl) -{ - rrw_node_t *rn; - - rn = kmem_alloc(sizeof (*rn), KM_SLEEP); - rn->rn_rrl = rrl; - rn->rn_next = tsd_get(rrw_tsd_key); - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); -} - -/* - * If a node is found for 'rrl', then remove the node from this - * thread's list and return TRUE; otherwise return FALSE. 
- */ -static boolean_t -rrn_find_and_remove(rrwlock_t *rrl) -{ - rrw_node_t *rn; - rrw_node_t *prev = NULL; - - if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) { - if (prev) - prev->rn_next = rn->rn_next; - else - VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); - kmem_free(rn, sizeof (*rn)); - return (B_TRUE); - } - prev = rn; - } - return (B_FALSE); -} - -void -rrw_init(rrwlock_t *rrl) -{ - mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); - rrl->rr_writer = NULL; - refcount_create(&rrl->rr_anon_rcount); - refcount_create(&rrl->rr_linked_rcount); - rrl->rr_writer_wanted = B_FALSE; -} - -void -rrw_destroy(rrwlock_t *rrl) -{ - mutex_destroy(&rrl->rr_lock); - cv_destroy(&rrl->rr_cv); - ASSERT(rrl->rr_writer == NULL); - refcount_destroy(&rrl->rr_anon_rcount); - refcount_destroy(&rrl->rr_linked_rcount); -} - -static void -rrw_enter_read(rrwlock_t *rrl, void *tag) -{ - mutex_enter(&rrl->rr_lock); - ASSERT(rrl->rr_writer != curthread); - ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); - - while (rrl->rr_writer || (rrl->rr_writer_wanted && - refcount_is_zero(&rrl->rr_anon_rcount) && - rrn_find(rrl) == NULL)) - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - - if (rrl->rr_writer_wanted) { - /* may or may not be a re-entrant enter */ - rrn_add(rrl); - (void) refcount_add(&rrl->rr_linked_rcount, tag); - } else { - (void) refcount_add(&rrl->rr_anon_rcount, tag); - } - ASSERT(rrl->rr_writer == NULL); - mutex_exit(&rrl->rr_lock); -} - -static void -rrw_enter_write(rrwlock_t *rrl) -{ - mutex_enter(&rrl->rr_lock); - ASSERT(rrl->rr_writer != curthread); - - while (refcount_count(&rrl->rr_anon_rcount) > 0 || - refcount_count(&rrl->rr_linked_rcount) > 0 || - rrl->rr_writer != NULL) { - rrl->rr_writer_wanted = B_TRUE; - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - } - rrl->rr_writer_wanted = B_FALSE; - rrl->rr_writer = curthread; - 
mutex_exit(&rrl->rr_lock); -} - -void -rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rrw_enter_read(rrl, tag); - else - rrw_enter_write(rrl); -} - -void -rrw_exit(rrwlock_t *rrl, void *tag) -{ - mutex_enter(&rrl->rr_lock); - ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || - !refcount_is_zero(&rrl->rr_linked_rcount) || - rrl->rr_writer != NULL); - - if (rrl->rr_writer == NULL) { - if (rrn_find_and_remove(rrl)) { - if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - - } else { - if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - } - } else { - ASSERT(rrl->rr_writer == curthread); - ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && - refcount_is_zero(&rrl->rr_linked_rcount)); - rrl->rr_writer = NULL; - cv_broadcast(&rrl->rr_cv); - } - mutex_exit(&rrl->rr_lock); -} - -boolean_t -rrw_held(rrwlock_t *rrl, krw_t rw) -{ - boolean_t held; - - mutex_enter(&rrl->rr_lock); - if (rw == RW_WRITER) { - held = (rrl->rr_writer == curthread); - } else { - held = (!refcount_is_zero(&rrl->rr_anon_rcount) || - !refcount_is_zero(&rrl->rr_linked_rcount)); - } - mutex_exit(&rrl->rr_lock); - - return (held); -} diff --git a/zfs/lib/libdmu-ctl/zfs_acl.c b/zfs/lib/libdmu-ctl/zfs_acl.c deleted file mode 100644 index cc2f97e1b..000000000 --- a/zfs/lib/libdmu-ctl/zfs_acl.c +++ /dev/null @@ -1,2641 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_acl.c 1.25 08/04/08 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/sid.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/unistd.h> -#include <sys/sdt.h> -#include <sys/fs/zfs.h> -#include <sys/mode.h> -#include <sys/policy.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_fuid.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_vfsops.h> -#include <sys/dmu.h> -#include <sys/dnode.h> -#include <sys/zap.h> -#include "fs/fs_subr.h" -#include <acl/acl_common.h> - -#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE -#define DENY ACE_ACCESS_DENIED_ACE_TYPE -#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE - -#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) -#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) -#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) - -#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ - ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ - 
ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) - -#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ - ACE_WRITE_OWNER) - -#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) - -#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ - ZFS_ACL_PROTECTED) - -#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ - ZFS_ACL_OBJ_ACE) - -static uint16_t -zfs_ace_v0_get_type(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_v0_get_flags(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_v0_get_mask(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_v0_get_who(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_fuid); -} - -static void -zfs_ace_v0_set_type(void *acep, uint16_t type) -{ - ((zfs_oldace_t *)acep)->z_type = type; -} - -static void -zfs_ace_v0_set_flags(void *acep, uint16_t flags) -{ - ((zfs_oldace_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_v0_set_mask(void *acep, uint32_t mask) -{ - ((zfs_oldace_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_v0_set_who(void *acep, uint64_t who) -{ - ((zfs_oldace_t *)acep)->z_fuid = who; -} - -/*ARGSUSED*/ -static size_t -zfs_ace_v0_size(void *acep) -{ - return (sizeof (zfs_oldace_t)); -} - -static size_t -zfs_ace_v0_abstract_size(void) -{ - return (sizeof (zfs_oldace_t)); -} - -static int -zfs_ace_v0_mask_off(void) -{ - return (offsetof(zfs_oldace_t, z_access_mask)); -} - -/*ARGSUSED*/ 
-static int -zfs_ace_v0_data(void *acep, void **datap) -{ - *datap = NULL; - return (0); -} - -static acl_ops_t zfs_acl_v0_ops = { - zfs_ace_v0_get_mask, - zfs_ace_v0_set_mask, - zfs_ace_v0_get_flags, - zfs_ace_v0_set_flags, - zfs_ace_v0_get_type, - zfs_ace_v0_set_type, - zfs_ace_v0_get_who, - zfs_ace_v0_set_who, - zfs_ace_v0_size, - zfs_ace_v0_abstract_size, - zfs_ace_v0_mask_off, - zfs_ace_v0_data -}; - -static uint16_t -zfs_ace_fuid_get_type(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_fuid_get_flags(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_fuid_get_mask(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_fuid_get_who(void *args) -{ - uint16_t entry_type; - zfs_ace_t *acep = args; - - entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (-1); - return (((zfs_ace_t *)acep)->z_fuid); -} - -static void -zfs_ace_fuid_set_type(void *acep, uint16_t type) -{ - ((zfs_ace_hdr_t *)acep)->z_type = type; -} - -static void -zfs_ace_fuid_set_flags(void *acep, uint16_t flags) -{ - ((zfs_ace_hdr_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_fuid_set_mask(void *acep, uint32_t mask) -{ - ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_fuid_set_who(void *arg, uint64_t who) -{ - zfs_ace_t *acep = arg; - - uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return; - acep->z_fuid = who; -} - -static size_t -zfs_ace_fuid_size(void *acep) -{ - zfs_ace_hdr_t *zacep = acep; - uint16_t entry_type; - - switch (zacep->z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - return (sizeof 
(zfs_object_ace_t)); - case ALLOW: - case DENY: - entry_type = - (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); - if (entry_type == ACE_OWNER || - entry_type == (ACE_GROUP | ACE_IDENTIFIER_GROUP) || - entry_type == ACE_EVERYONE) - return (sizeof (zfs_ace_hdr_t)); - /*FALLTHROUGH*/ - default: - return (sizeof (zfs_ace_t)); - } -} - -static size_t -zfs_ace_fuid_abstract_size(void) -{ - return (sizeof (zfs_ace_hdr_t)); -} - -static int -zfs_ace_fuid_mask_off(void) -{ - return (offsetof(zfs_ace_hdr_t, z_access_mask)); -} - -static int -zfs_ace_fuid_data(void *acep, void **datap) -{ - zfs_ace_t *zacep = acep; - zfs_object_ace_t *zobjp; - - switch (zacep->z_hdr.z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjp = acep; - *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); - return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); - default: - *datap = NULL; - return (0); - } -} - -static acl_ops_t zfs_acl_fuid_ops = { - zfs_ace_fuid_get_mask, - zfs_ace_fuid_set_mask, - zfs_ace_fuid_get_flags, - zfs_ace_fuid_set_flags, - zfs_ace_fuid_get_type, - zfs_ace_fuid_set_type, - zfs_ace_fuid_get_who, - zfs_ace_fuid_set_who, - zfs_ace_fuid_size, - zfs_ace_fuid_abstract_size, - zfs_ace_fuid_mask_off, - zfs_ace_fuid_data -}; - -static int -zfs_acl_version(int version) -{ - if (version < ZPL_VERSION_FUID) - return (ZFS_ACL_VERSION_INITIAL); - else - return (ZFS_ACL_VERSION_FUID); -} - -static int -zfs_acl_version_zp(znode_t *zp) -{ - return (zfs_acl_version(zp->z_zfsvfs->z_version)); -} - -static zfs_acl_t * -zfs_acl_alloc(int vers) -{ - zfs_acl_t *aclp; - - aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), - offsetof(zfs_acl_node_t, z_next)); - aclp->z_version = vers; - if (vers == ZFS_ACL_VERSION_FUID) - aclp->z_ops = zfs_acl_fuid_ops; - else - aclp->z_ops = zfs_acl_v0_ops; - return (aclp); -} - 
-static zfs_acl_node_t * -zfs_acl_node_alloc(size_t bytes) -{ - zfs_acl_node_t *aclnode; - - aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); - if (bytes) { - aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); - aclnode->z_allocdata = aclnode->z_acldata; - aclnode->z_allocsize = bytes; - aclnode->z_size = bytes; - } - - return (aclnode); -} - -static void -zfs_acl_node_free(zfs_acl_node_t *aclnode) -{ - if (aclnode->z_allocsize) - kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); - kmem_free(aclnode, sizeof (zfs_acl_node_t)); -} - -static void -zfs_acl_release_nodes(zfs_acl_t *aclp) -{ - zfs_acl_node_t *aclnode; - - while (aclnode = list_head(&aclp->z_acl)) { - list_remove(&aclp->z_acl, aclnode); - zfs_acl_node_free(aclnode); - } - aclp->z_acl_count = 0; - aclp->z_acl_bytes = 0; -} - -void -zfs_acl_free(zfs_acl_t *aclp) -{ - zfs_acl_release_nodes(aclp); - list_destroy(&aclp->z_acl); - kmem_free(aclp, sizeof (zfs_acl_t)); -} - -static boolean_t -zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) -{ - /* - * first check type of entry - */ - - switch (iflags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - case ACE_IDENTIFIER_GROUP: - case ACE_EVERYONE: - case 0: /* User entry */ - break; - default: - return (B_FALSE); - - } - - /* - * next check inheritance level flags - */ - - if (type != ALLOW && type > MAX_ACE_TYPE) { - return (B_FALSE); - } - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (aclp->z_version < ZFS_ACL_VERSION_FUID) - return (B_FALSE); - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - } - - /* - * Only directories should have inheritance flags. 
- */ - if (obj_type != VDIR && (iflags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { - return (B_FALSE); - } - - if (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void * -zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, - uint32_t *access_mask, uint16_t *iflags, uint16_t *type) -{ - zfs_acl_node_t *aclnode; - - if (start == NULL) { - aclnode = list_head(&aclp->z_acl); - if (aclnode == NULL) - return (NULL); - - aclp->z_next_ace = aclnode->z_acldata; - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - } - - aclnode = aclp->z_curr_node; - - if (aclnode == NULL) - return (NULL); - - if (aclnode->z_ace_idx >= aclnode->z_ace_count) { - aclnode = list_next(&aclp->z_acl, aclnode); - if (aclnode == NULL) - return (NULL); - else { - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - aclp->z_next_ace = aclnode->z_acldata; - } - } - - if (aclnode->z_ace_idx < aclnode->z_ace_count) { - void *acep = aclp->z_next_ace; - *iflags = aclp->z_ops.ace_flags_get(acep); - *type = aclp->z_ops.ace_type_get(acep); - *access_mask = aclp->z_ops.ace_mask_get(acep); - *who = aclp->z_ops.ace_who_get(acep); - aclp->z_next_ace = (caddr_t)aclp->z_next_ace + - aclp->z_ops.ace_size(acep); - aclnode->z_ace_idx++; - return ((void *)acep); - } - return (NULL); -} - -/*ARGSUSED*/ -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, - uint16_t *flags, uint16_t *type, uint32_t *mask) -{ - zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; - uint64_t who; - - acep = zfs_acl_next_ace(aclp, acep, &who, mask, - flags, type); - return ((uint64_t)(uintptr_t)acep); -} - -static zfs_acl_node_t * 
-zfs_acl_curr_node(zfs_acl_t *aclp) -{ - ASSERT(aclp->z_curr_node); - return (aclp->z_curr_node); -} - -/* - * Copy ACE to internal ZFS format. - * While processing the ACL each ACE will be validated for correctness. - * ACE FUIDs will be created later. - */ -int -zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, - zfs_ace_t *z_acl, int aclcnt, size_t *size) -{ - int i; - uint16_t entry_type; - zfs_ace_t *aceptr = z_acl; - ace_t *acep = datap; - zfs_object_ace_t *zobjacep; - ace_object_t *aceobjp; - - for (i = 0; i != aclcnt; i++) { - aceptr->z_hdr.z_access_mask = acep->a_access_mask; - aceptr->z_hdr.z_flags = acep->a_flags; - aceptr->z_hdr.z_type = acep->a_type; - entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; - if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE) { - if (!aclp->z_has_fuids) - aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); - aceptr->z_fuid = (uint64_t)acep->a_who; - } - - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, - aceptr->z_hdr.z_flags) != B_TRUE) - return (EINVAL); - - switch (acep->a_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjacep = (zfs_object_ace_t *)aceptr; - aceobjp = (ace_object_t *)acep; - - bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, - sizeof (aceobjp->a_obj_type)); - bcopy(aceobjp->a_inherit_obj_type, - zobjacep->z_inherit_type, - sizeof (aceobjp->a_inherit_obj_type)); - acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); - break; - default: - acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); - } - - aceptr = (zfs_ace_t *)((caddr_t)aceptr + - aclp->z_ops.ace_size(aceptr)); - } - - *size = (caddr_t)aceptr - (caddr_t)z_acl; - - return (0); -} - -/* - * Copy ZFS ACEs to fixed size ace_t layout - */ -static void -zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, - 
void *datap, int filter) -{ - uint64_t who; - uint32_t access_mask; - uint16_t iflags, type; - zfs_ace_hdr_t *zacep = NULL; - ace_t *acep = datap; - ace_object_t *objacep; - zfs_object_ace_t *zobjacep; - size_t ace_size; - uint16_t entry_type; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (filter) { - continue; - } - zobjacep = (zfs_object_ace_t *)zacep; - objacep = (ace_object_t *)acep; - bcopy(zobjacep->z_object_type, - objacep->a_obj_type, - sizeof (zobjacep->z_object_type)); - bcopy(zobjacep->z_inherit_type, - objacep->a_inherit_obj_type, - sizeof (zobjacep->z_inherit_type)); - ace_size = sizeof (ace_object_t); - break; - default: - ace_size = sizeof (ace_t); - break; - } - - entry_type = (iflags & ACE_TYPE_FLAGS); - if ((entry_type != ACE_OWNER && - entry_type != (ACE_GROUP | ACE_IDENTIFIER_GROUP) && - entry_type != ACE_EVERYONE)) { - acep->a_who = zfs_fuid_map_id(zfsvfs, who, - cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
- ZFS_ACE_GROUP : ZFS_ACE_USER); - } else { - acep->a_who = (uid_t)(int64_t)who; - } - acep->a_access_mask = access_mask; - acep->a_flags = iflags; - acep->a_type = type; - acep = (ace_t *)((caddr_t)acep + ace_size); - } -} - -static int -zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, - zfs_oldace_t *z_acl, int aclcnt, size_t *size) -{ - int i; - zfs_oldace_t *aceptr = z_acl; - - for (i = 0; i != aclcnt; i++, aceptr++) { - aceptr->z_access_mask = acep[i].a_access_mask; - aceptr->z_type = acep[i].a_type; - aceptr->z_flags = acep[i].a_flags; - aceptr->z_fuid = acep[i].a_who; - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, - aceptr->z_flags) != B_TRUE) - return (EINVAL); - } - *size = (caddr_t)aceptr - (caddr_t)z_acl; - return (0); -} - -/* - * convert old ACL format to new - */ -void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) -{ - zfs_oldace_t *oldaclp; - int i; - uint16_t type, iflags; - uint32_t access_mask; - uint64_t who; - void *cookie = NULL; - zfs_acl_node_t *newaclnode; - - ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); - /* - * First create the ACE in a contiguous piece of memory - * for zfs_copy_ace_2_fuid(). - * - * We only convert an ACL once, so this won't happen - * everytime. 
- */ - oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, - KM_SLEEP); - i = 0; - while (cookie = zfs_acl_next_ace(aclp, cookie, &who, - &access_mask, &iflags, &type)) { - oldaclp[i].z_flags = iflags; - oldaclp[i].z_type = type; - oldaclp[i].z_fuid = who; - oldaclp[i++].z_access_mask = access_mask; - } - - newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * - sizeof (zfs_object_ace_t)); - aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, - newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size) == 0); - newaclnode->z_ace_count = aclp->z_acl_count; - aclp->z_version = ZFS_ACL_VERSION; - kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); - - /* - * Release all previous ACL nodes - */ - - zfs_acl_release_nodes(aclp); - - list_insert_head(&aclp->z_acl, newaclnode); - - aclp->z_acl_bytes = newaclnode->z_size; - aclp->z_acl_count = newaclnode->z_ace_count; - -} - -/* - * Convert unix access mask to v4 access mask - */ -static uint32_t -zfs_unix_to_v4(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - if (access_mask & S_IXOTH) - new_mask |= ACE_EXECUTE; - if (access_mask & S_IWOTH) - new_mask |= ACE_WRITE_DATA; - if (access_mask & S_IROTH) - new_mask |= ACE_READ_DATA; - return (new_mask); -} - -static void -zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, - uint16_t access_type, uint64_t fuid, uint16_t entry_type) -{ - uint16_t type = entry_type & ACE_TYPE_FLAGS; - - aclp->z_ops.ace_mask_set(acep, access_mask); - aclp->z_ops.ace_type_set(acep, access_type); - aclp->z_ops.ace_flags_set(acep, entry_type); - if ((type != ACE_OWNER && type != (ACE_GROUP | ACE_IDENTIFIER_GROUP) && - type != ACE_EVERYONE)) - aclp->z_ops.ace_who_set(acep, fuid); -} - -/* - * Determine mode of file based on ACL. 
- * Also, create FUIDs for any User/Group ACEs - */ -static uint64_t -zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) -{ - int entry_type; - mode_t mode; - mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; - uint64_t who; - uint16_t iflags, type; - uint32_t access_mask; - - mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, - &access_mask, &iflags, &type)) { - - /* - * Skip over inherit only ACEs - */ - if (iflags & ACE_INHERIT_ONLY_ACE) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - if (entry_type == ACE_OWNER) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRUSR))) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWUSR))) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXUSR))) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - } else if (entry_type == OWNING_GROUP) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRGRP))) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWGRP))) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXGRP))) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - } else if (entry_type == ACE_EVERYONE) { - if ((access_mask & ACE_READ_DATA)) { - if (!(seen & S_IRUSR)) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if (!(seen & S_IRGRP)) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if (!(seen & S_IROTH)) { - seen |= S_IROTH; - if (type == ALLOW) { - mode |= S_IROTH; - } - } - } - if ((access_mask & ACE_WRITE_DATA)) { - if (!(seen & S_IWUSR)) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= 
S_IWUSR; - } - } - if (!(seen & S_IWGRP)) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if (!(seen & S_IWOTH)) { - seen |= S_IWOTH; - if (type == ALLOW) { - mode |= S_IWOTH; - } - } - } - if ((access_mask & ACE_EXECUTE)) { - if (!(seen & S_IXUSR)) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - if (!(seen & S_IXGRP)) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - if (!(seen & S_IXOTH)) { - seen |= S_IXOTH; - if (type == ALLOW) { - mode |= S_IXOTH; - } - } - } - } - /* - * Now handle FUID create for user/group ACEs - */ - if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { - aclp->z_ops.ace_who_set(acep, - zfs_fuid_create(zp->z_zfsvfs, who, cr, - (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, - tx, fuidp)); - } - } - return (mode); -} - -static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - - /* - * Version 0 to 1 znode_acl_phys has the size/count fields swapped. - * Version 0 didn't have a size field, only a count. - */ - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; - aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); - } else { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; - } - - aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); - aclnode->z_ace_count = aclp->z_acl_count; - if (will_modify) { - bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, - aclp->z_acl_bytes); - } else { - aclnode->z_size = aclp->z_acl_bytes; - aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; - } - - list_insert_head(&aclp->z_acl, aclnode); - - return (aclp); -} - -/* - * Read an external acl object. 
- */ -static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) -{ - uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; - zfs_acl_t *aclp; - size_t aclsize; - size_t acl_count; - zfs_acl_node_t *aclnode; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp, will_modify); - return (0); - } - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - zfs_acl_phys_v0_t *zacl0 = - (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; - - aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); - acl_count = zacl0->z_acl_count; - } else { - aclsize = zp->z_phys->zp_acl.z_acl_size; - acl_count = zp->z_phys->zp_acl.z_acl_count; - if (aclsize == 0) - aclsize = acl_count * sizeof (zfs_ace_t); - } - aclnode = zfs_acl_node_alloc(aclsize); - list_insert_head(&aclp->z_acl, aclnode); - error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata); - aclnode->z_ace_count = acl_count; - aclp->z_acl_count = acl_count; - aclp->z_acl_bytes = aclsize; - - if (error != 0) { - zfs_acl_free(aclp); - return (error); - } - - *aclpp = aclp; - return (0); -} - -/* - * common code for setting ACLs. - * - * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. - * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's - * already checked the acl and knows whether to inherit. 
- */ -int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) -{ - int error; - znode_phys_t *zphys = zp->z_phys; - zfs_acl_phys_t *zacl = &zphys->zp_acl; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; - uint64_t off = 0; - dmu_object_type_t otype; - zfs_acl_node_t *aclnode; - - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - dmu_buf_will_dirty(zp->z_dbuf, tx); - - zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); - - /* - * Decide which opbject type to use. If we are forced to - * use old ACL format than transform ACL into zfs_oldace_t - * layout. - */ - if (!zfsvfs->z_use_fuids) { - otype = DMU_OT_OLDACL; - } else { - if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && - (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp); - ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); - otype = DMU_OT_ACL; - } - - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && aclp->z_version != zacl->z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - aclp->z_acl_bytes, 0, tx); - } - zphys->zp_acl.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; - } - } else { - void *start = zacl->z_ace_data; - /* - * Migrating back embedded? 
- */ - if (zphys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - zphys->zp_acl.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - } - - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - zphys->zp_acl.z_acl_size = aclp->z_acl_count; - zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; - } else { - zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - } - - zphys->zp_acl.z_acl_version = aclp->z_version; - - /* - * Replace ACL wide bits, but first clear them. - */ - zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; - - zp->z_phys->zp_flags |= aclp->z_hints; - - if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - return (0); -} - -/* - * Update access mask for prepended ACE - * - * This applies the "groupmask" value for aclmode property. 
- */ -static void -zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, - mode_t mode, uint64_t owner) -{ - int rmask, wmask, xmask; - int user_ace; - uint16_t aceflags; - uint32_t origmask, acepmask; - uint64_t fuid; - - aceflags = aclp->z_ops.ace_flags_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - origmask = aclp->z_ops.ace_mask_get(origacep); - acepmask = aclp->z_ops.ace_mask_get(acep); - - user_ace = (!(aceflags & - (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - - if (user_ace && (fuid == owner)) { - rmask = S_IRUSR; - wmask = S_IWUSR; - xmask = S_IXUSR; - } else { - rmask = S_IRGRP; - wmask = S_IWGRP; - xmask = S_IXGRP; - } - - if (origmask & ACE_READ_DATA) { - if (mode & rmask) { - acepmask &= ~ACE_READ_DATA; - } else { - acepmask |= ACE_READ_DATA; - } - } - - if (origmask & ACE_WRITE_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_WRITE_DATA; - } else { - acepmask |= ACE_WRITE_DATA; - } - } - - if (origmask & ACE_APPEND_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_APPEND_DATA; - } else { - acepmask |= ACE_APPEND_DATA; - } - } - - if (origmask & ACE_EXECUTE) { - if (mode & xmask) { - acepmask &= ~ACE_EXECUTE; - } else { - acepmask |= ACE_EXECUTE; - } - } - aclp->z_ops.ace_mask_set(acep, acepmask); -} - -/* - * Apply mode to canonical six ACEs. 
- */ -static void -zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) -{ - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - void *acep; - int maskoff = aclp->z_ops.ace_mask_off(); - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (abstract_size * 6)); - - /* - * Fixup final ACEs to match the mode - */ - - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0700) >> 6); /* owner@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0070) >> 3); /* group@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - adjust_ace_pair_common(acep, maskoff, - abstract_size, mode); /* everyone@ */ -} - - -static int -zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, - int entry_type, int accessmask) -{ - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t type = aclp->z_ops.ace_type_get(acep); - uint16_t flags = aclp->z_ops.ace_flags_get(acep); - - return (mask == accessmask && type == allow_deny && - ((flags & ACE_TYPE_FLAGS) == entry_type)); -} - -/* - * Can prepended ACE be reused? 
- */ -static int -zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) -{ - int okay_masks; - uint16_t prevtype; - uint16_t prevflags; - uint16_t flags; - uint32_t mask, prevmask; - - if (prevacep == NULL) - return (B_FALSE); - - prevtype = aclp->z_ops.ace_type_get(prevacep); - prevflags = aclp->z_ops.ace_flags_get(prevacep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - prevmask = aclp->z_ops.ace_mask_get(prevacep); - - if (prevtype != DENY) - return (B_FALSE); - - if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) - return (B_FALSE); - - okay_masks = (mask & OKAY_MASK_BITS); - - if (prevmask & ~okay_masks) - return (B_FALSE); - - return (B_TRUE); -} - - -/* - * Insert new ACL node into chain of zfs_acl_node_t's - * - * This will result in two possible results. - * 1. If the ACL is currently just a single zfs_acl_node and - * we are prepending the entry then current acl node will have - * a new node inserted above it. - * - * 2. If we are inserting in the middle of current acl node then - * the current node will be split in two and new node will be inserted - * in between the two split nodes. 
- */ -static zfs_acl_node_t * -zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) -{ - zfs_acl_node_t *newnode; - zfs_acl_node_t *trailernode = NULL; - zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); - int curr_idx = aclp->z_curr_node->z_ace_idx; - int trailer_count; - size_t oldsize; - - newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); - newnode->z_ace_count = 1; - - oldsize = currnode->z_size; - - if (curr_idx != 1) { - trailernode = zfs_acl_node_alloc(0); - trailernode->z_acldata = acep; - - trailer_count = currnode->z_ace_count - curr_idx + 1; - currnode->z_ace_count = curr_idx - 1; - currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; - trailernode->z_size = oldsize - currnode->z_size; - trailernode->z_ace_count = trailer_count; - } - - aclp->z_acl_count += 1; - aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); - - if (curr_idx == 1) - list_insert_before(&aclp->z_acl, currnode, newnode); - else - list_insert_after(&aclp->z_acl, currnode, newnode); - if (trailernode) { - list_insert_after(&aclp->z_acl, newnode, trailernode); - aclp->z_curr_node = trailernode; - trailernode->z_ace_idx = 1; - } - - return (newnode); -} - -/* - * Prepend deny ACE - */ -static void * -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, - mode_t mode) -{ - zfs_acl_node_t *aclnode; - void *newacep; - uint64_t fuid; - uint16_t flags; - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - fuid = aclp->z_ops.ace_who_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); - - return (newacep); -} - -/* - * Split an inherited ACE into inherit_only ACE - * and original ACE with inheritance flags stripped off. 
- */ -static void -zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) -{ - zfs_acl_node_t *aclnode; - zfs_acl_node_t *currnode; - void *newacep; - uint16_t type, flags; - uint32_t mask; - uint64_t fuid; - - type = aclp->z_ops.ace_type_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); - aclp->z_ops.ace_mask_set(newacep, mask); - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_who_set(newacep, fuid); - aclp->z_next_ace = acep; - flags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, flags); - currnode = zfs_acl_curr_node(aclp); - ASSERT(currnode->z_ace_idx >= 1); - currnode->z_ace_idx -= 1; -} - -/* - * Are ACES started at index i, the canonical six ACES? - */ -static int -zfs_have_canonical_six(zfs_acl_t *aclp) -{ - void *acep; - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - int i = 0; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - if (aclnode->z_ace_count < 6) - return (0); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); - - if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, - OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + - (abstract_size * i++), - ALLOW, OWNING_GROUP, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { - return (1); - } else { - return (0); - } 
-} - - -/* - * Apply step 1g, to group entries - * - * Need to deal with corner case where group may have - * greater permissions than owner. If so then limit - * group permissions, based on what extra permissions - * group has. - */ -static void -zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, - mode_t mode) -{ - uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); - mode_t extramode = (mode >> 3) & 07; - mode_t ownermode = (mode >> 6); - - if (prevflags & ACE_IDENTIFIER_GROUP) { - - extramode &= ~ownermode; - - if (extramode) { - if (extramode & S_IROTH) { - prevmask &= ~ACE_READ_DATA; - mask &= ~ACE_READ_DATA; - } - if (extramode & S_IWOTH) { - prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - } - if (extramode & S_IXOTH) { - prevmask &= ~ACE_EXECUTE; - mask &= ~ACE_EXECUTE; - } - } - } - aclp->z_ops.ace_mask_set(acep, mask); - aclp->z_ops.ace_mask_set(prevacep, prevmask); -} - -/* - * Apply the chmod algorithm as described - * in PSARC/2002/240 - */ -static void -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - void *acep = NULL, *prevacep = NULL; - uint64_t who; - int i; - int entry_type; - int reuse_deny; - int need_canonical_six = 1; - uint16_t iflags, type; - uint32_t access_mask; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - - /* - * If discard then just discard all ACL nodes which - * represent the ACEs. - * - * New owner@/group@/everone@ ACEs will be added - * later. 
- */ - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) - zfs_acl_release_nodes(aclp); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - - entry_type = (iflags & ACE_TYPE_FLAGS); - iflags = (iflags & ALL_INHERIT); - - if ((type != ALLOW && type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - if (iflags) - aclp->z_hints |= ZFS_INHERIT_ACE; - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - break; - } - goto nextace; - } - - /* - * Need to split ace into two? - */ - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) && - (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, acep); - aclp->z_hints |= ZFS_INHERIT_ACE; - goto nextace; - } - - if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) { - access_mask &= ~OGE_CLEAR; - aclp->z_ops.ace_mask_set(acep, access_mask); - goto nextace; - } else { - reuse_deny = B_TRUE; - if (type == ALLOW) { - - /* - * Check preceding ACE if any, to see - * if we need to prepend a DENY ACE. - * This is only applicable when the acl_mode - * property == groupmask. - */ - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - - reuse_deny = zfs_reuse_deny(aclp, acep, - prevacep); - - if (!reuse_deny) { - prevacep = - zfs_acl_prepend_deny(zp, - aclp, acep, mode); - } else { - zfs_acl_prepend_fixup( - aclp, prevacep, - acep, mode, - zp->z_phys->zp_uid); - } - zfs_fixup_group_entries(aclp, acep, - prevacep, mode); - - } - } - } -nextace: - prevacep = acep; - } - - /* - * Check out last six aces, if we have six. 
- */ - - if (aclp->z_acl_count >= 6) { - if (zfs_have_canonical_six(aclp)) { - need_canonical_six = 0; - } - } - - if (need_canonical_six) { - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; - zfs_acl_node_t *aclnode = - zfs_acl_node_alloc(abstract_size * 6); - - aclnode->z_size = abstract_size * 6; - aclnode->z_ace_count = 6; - aclp->z_acl_bytes += aclnode->z_size; - list_insert_tail(&aclp->z_acl, aclnode); - - zacep = aclnode->z_acldata; - - i = 0; - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - 0, DENY, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - DENY, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - ALLOW, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); - aclp->z_acl_count += 6; - } - - zfs_acl_fixup_canonical_six(aclp, mode); -} - -int -zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) -{ - int error; - - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - *aclp = NULL; - error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) - zfs_acl_chmod(zp, mode, *aclp); - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - return (error); -} - -/* - * strip off write_owner and write_acl - */ -static void -zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) -{ - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - - if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && - (aclp->z_ops.ace_type_get(acep) == ALLOW)) { - mask &= ~RESTRICTED_CLEAR; - aclp->z_ops.ace_mask_set(acep, mask); - } -} - -/* - * Should ACE be inherited? 
- */ -static int -zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) -{ - int vtype = ZTOV(zp)->v_type; - int iflags = (acep_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); -} - -/* - * inherit inheritable ACEs from parent - */ -static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, boolean_t *need_chmod) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - void *pacep; - void *acep, *acep2; - zfs_acl_node_t *aclnode, *aclnode2; - zfs_acl_t *aclp = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t iflags, newflags, type; - size_t ace_size; - void *data1, *data2; - size_t data1sz, data2sz; - enum vtype vntype = ZTOV(zp)->v_type; - - *need_chmod = B_TRUE; - pacep = NULL; - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - while (pacep = zfs_acl_next_ace(paclp, pacep, &who, - &access_mask, &iflags, &type)) { - - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - type == ALLOW) - continue; - - ace_size = aclp->z_ops.ace_size(pacep); - - if (!zfs_ace_can_use(zp, iflags)) - continue; - - /* - * If owner@, group@, or everyone@ inheritable - * then zfs_acl_chmod() isn't needed. 
- */ - if (zfsvfs->z_acl_inherit == - ZFS_ACL_PASSTHROUGH && - ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == - OWNING_GROUP)) && (vntype == VREG || - (vntype == VDIR && - (iflags & ACE_DIRECTORY_INHERIT_ACE)))) - *need_chmod = B_FALSE; - - aclnode = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode); - acep = aclnode->z_acldata; - zfs_set_ace(aclp, acep, access_mask, type, - who, iflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = paclp->z_ops.ace_data(pacep, - &data1)) != 0) { - VERIFY((data2sz = aclp->z_ops.ace_data(acep, - &data2)) == data1sz); - bcopy(data1, data2, data2sz); - } - aclp->z_acl_count++; - aclnode->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - newflags = aclp->z_ops.ace_flags_get(acep); - - if (vntype == VDIR) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || - (vntype != VDIR)) { - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - zfs_restricted_update(zfsvfs, aclp, acep); - continue; - } - - ASSERT(vntype == VDIR); - - newflags = aclp->z_ops.ace_flags_get(acep); - if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - aclnode2 = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode2); - acep2 = aclnode2->z_acldata; - zfs_set_ace(aclp, acep2, - access_mask, type, who, - iflags|ACE_INHERITED_ACE); - newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, newflags); - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep2, - newflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = aclp->z_ops.ace_data(acep, - &data1)) != 0) { - VERIFY((data2sz = - aclp->z_ops.ace_data(acep2, - &data2)) == data1sz); - bcopy(data1, data2, data1sz); - } - aclp->z_acl_count++; - aclnode2->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - zfs_restricted_update(zfsvfs, aclp, 
acep2); - } else { - newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } - } - } - return (aclp); -} - -/* - * Create file system object initial permissions - * including inheritable ACEs. - */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) -{ - uint64_t mode, fuid, fgid; - int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp = NULL; - zfs_acl_t *paclp; - xvattr_t *xvap = (xvattr_t *)vap; - gid_t gid; - boolean_t need_chmod = B_TRUE; - - if (setaclp) - aclp = setaclp; - - mode = MAKEIMODE(vap->va_type, vap->va_mode); - - /* - * Determine uid and gid. - */ - if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || - ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, - ZFS_OWNER, tx, fuidp); - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); - gid = vap->va_gid; - } else { - fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); - fgid = 0; - if (vap->va_mask & AT_GID) { - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); - gid = vap->va_gid; - if (fgid != parent->z_phys->zp_gid && - !groupmember(vap->va_gid, cr) && - secpolicy_vnode_create_gid(cr) != 0) - fgid = 0; - } - if (fgid == 0) { - if (parent->z_phys->zp_mode & S_ISGID) { - fgid = parent->z_phys->zp_gid; - gid = zfs_fuid_map_id(zfsvfs, fgid, - cr, ZFS_GROUP); - } else { - fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, tx, cr, fuidp); - gid = crgetgid(cr); - } - } - } - - /* - * If we're creating a directory, and the parent directory has the - * set-GID bit set, set in on the new directory. - * Otherwise, if the user is neither privileged nor a member of the - * file's new group, clear the file's set-GID bit. 
- */ - - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { - mode |= S_ISGID; - } else { - if ((mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = fuid; - zp->z_phys->zp_gid = fgid; - zp->z_phys->zp_mode = mode; - - if (aclp == NULL) { - mutex_enter(&parent->z_lock); - if (parent->z_phys->zp_flags & ZFS_INHERIT_ACE) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp, &need_chmod); - zfs_acl_free(paclp); - } else { - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - if (need_chmod) - zfs_acl_chmod(zp, mode, aclp); - } else { - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - } - - /* Force auto_inherit on all new directory objects */ - if (vap->va_type == VDIR) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - - error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); - - /* Set optional attributes if any */ - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(zp, xvap); - - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); - - if (aclp != setaclp) - zfs_acl_free(aclp); -} - -/* - * Retrieve a files ACL - */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfs_acl_t *aclp; - ulong_t mask; - int error; - int count = 0; - int largeace = 0; - - mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | - VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - - if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) - return (error); - - if (mask == 0) - return (ENOSYS); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - /* - * Scan ACL to determine number of ACEs - */ - if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && - !(mask 
& VSA_ACE_ALLTYPES)) { - void *zacep = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t type, iflags; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - largeace++; - continue; - default: - count++; - } - } - vsecp->vsa_aclcnt = count; - } else - count = aclp->z_acl_count; - - if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = count; - } - - if (mask & VSA_ACE) { - size_t aclsz; - - zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); - - aclsz = count * sizeof (ace_t) + - sizeof (ace_object_t) * largeace; - - vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); - vsecp->vsa_aclentsz = aclsz; - - if (aclp->z_version == ZFS_ACL_VERSION_FUID) - zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, - vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); - else { - bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, - count * sizeof (ace_t)); - } - } - if (mask & VSA_ACE_ACLFLAGS) { - vsecp->vsa_aclflags = 0; - if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) - vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) - vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) - vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; - } - - mutex_exit(&zp->z_acl_lock); - - zfs_acl_free(aclp); - - return (0); -} - -int -zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, zfs_acl_t **zaclp) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - int aclcnt = vsecp->vsa_aclcnt; - int error; - - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) - return (EINVAL); - - aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); - - aclp->z_hints = 0; - aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - if ((error = 
zfs_copy_ace_2_oldace(obj_type, aclp, - (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, - aclcnt, &aclnode->z_size)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } else { - if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, - vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } - aclp->z_acl_bytes = aclnode->z_size; - aclnode->z_ace_count = aclcnt; - aclp->z_acl_count = aclcnt; - list_insert_head(&aclp->z_acl, aclnode); - - /* - * If flags are being set then add them to z_hints - */ - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { - if (vsecp->vsa_aclflags & ACL_PROTECTED) - aclp->z_hints |= ZFS_ACL_PROTECTED; - if (vsecp->vsa_aclflags & ACL_DEFAULTED) - aclp->z_hints |= ZFS_ACL_DEFAULTED; - if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - } - - *zaclp = aclp; - - return (0); -} - -/* - * Set a files ACL - */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp; - zfs_fuid_info_t *fuidp = NULL; - - if (mask == 0) - return (ENOSYS); - - if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) - return (EPERM); - - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) - return (error); - - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); - if (error) - return (error); - - /* - * If ACL wide flags aren't being set then preserve any - * existing flags. 
- */ - if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - } -top: - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { - zfs_acl_free(aclp); - return (error); - } - - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL? */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - zp->z_phys->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, aclp->z_acl_bytes); - } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); - } - if (aclp->z_has_fuids) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - zfs_acl_free(aclp); - return (error); - } - - error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); - ASSERT(error == 0); - - zfs_log_acl(zilog, tx, zp, vsecp, fuidp); - - if (fuidp) - zfs_fuid_info_free(fuidp); - zfs_acl_free(aclp); - dmu_tx_commit(tx); -done: - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - - return (error); -} - -/* - * working_mode returns the permissions that were not granted - */ -static int 
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) -{ - zfs_acl_t *aclp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t fowner; - uid_t gowner; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0) - return (0); - - *check_privs = B_TRUE; - - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *working_mode = 0; - return (0); - } - - *working_mode = v4_mode; - - if ((v4_mode & WRITE_MASK) && - (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { - *check_privs = B_FALSE; - return (EROFS); - } - - /* - * Only check for READONLY on non-directories. - */ - if ((v4_mode & WRITE_MASK_DATA) && - (((ZTOV(zp)->v_type != VDIR) && - (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || - (ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { - *check_privs = B_FALSE; - return (EPERM); - } - - if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { - *check_privs = B_FALSE; - return (EPERM); - } - - if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { - *check_privs = B_FALSE; - return (EACCES); - } - - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. 
- */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } - - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - - if (iflags & ACE_INHERIT_ONLY_ACE) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - checkit = B_FALSE; - - switch (entry_type) { - case ACE_OWNER: - if (uid == fowner) - checkit = B_TRUE; - break; - case OWNING_GROUP: - who = gowner; - /*FALLTHROUGH*/ - case ACE_IDENTIFIER_GROUP: - checkit = zfs_groupmember(zfsvfs, who, cr); - break; - case ACE_EVERYONE: - checkit = B_TRUE; - break; - - /* USER Entry */ - default: - if (entry_type == 0) { - uid_t newid; - - newid = zfs_fuid_map_id(zfsvfs, who, cr, - ZFS_ACE_USER); - if (newid != IDMAP_WK_CREATOR_OWNER_UID && - uid == newid) - checkit = B_TRUE; - break; - } else { - zfs_acl_free(aclp); - mutex_exit(&zp->z_acl_lock); - return (EIO); - } - } - - if (checkit) { - uint32_t mask_matched = (access_mask & *working_mode); - - if (mask_matched) { - if (type == DENY) - deny_mask |= mask_matched; - - *working_mode &= ~mask_matched; - } - } - - /* Are we done? */ - if (*working_mode == 0) - break; - } - - mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - - /* Put the found 'denies' back on the working mode */ - *working_mode |= deny_mask; - - if (*working_mode) - return (EACCES); - - return (0); -} - -static int -zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) -{ - if (*working_mode != ACE_WRITE_DATA) - return (EACCES); - - return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); -} - -/* - * Determine whether Access should be granted/denied, invoking least - * priv subsytem when a deny is determined. 
- */ -int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) -{ - uint32_t working_mode; - int error; - int is_attr; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - boolean_t check_privs; - znode_t *xzp; - znode_t *check_zp = zp; - - is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && - (ZTOV(zp)->v_type == VDIR)); - - /* - * If attribute then validate against base file - */ - if (is_attr) { - if ((error = zfs_zget(zp->z_zfsvfs, - zp->z_phys->zp_parent, &xzp)) != 0) { - return (error); - } - - check_zp = xzp; - - /* - * fixup mode to map to xattr perms - */ - - if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { - mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mode |= ACE_WRITE_NAMED_ATTRS; - } - - if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { - mode &= ~(ACE_READ_DATA|ACE_EXECUTE); - mode |= ACE_READ_NAMED_ATTRS; - } - } - - if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (0); - } - - if (error && !check_privs) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (error); - } - - if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); - } - - if (error && check_privs) { - uid_t owner; - mode_t checkmode = 0; - - owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, - ZFS_OWNER); - - /* - * First check for implicit owner permission on - * read_acl/read_attributes - */ - - error = 0; - ASSERT(working_mode != 0); - - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - owner == crgetuid(cr))) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES)) - checkmode |= VREAD; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES)) - checkmode |= VWRITE; - if (working_mode & ACE_EXECUTE) - checkmode |= VEXEC; - - if (checkmode) - error = secpolicy_vnode_access(cr, 
ZTOV(check_zp), - owner, checkmode); - - if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_create_gid(cr); - if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(cr, owner); - - if (error == 0 && (working_mode & - (ACE_DELETE|ACE_DELETE_CHILD))) - error = secpolicy_vnode_remove(cr); - - if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) - error = secpolicy_vnode_owner(cr, owner); - - if (error == 0) { - /* - * See if any bits other than those already checked - * for are still present. If so then return EACCES - */ - if (working_mode & ~(ZFS_CHECKED_MASKS)) { - error = EACCES; - } - } - } - - if (is_attr) - VN_RELE(ZTOV(xzp)); - - return (error); -} - -/* - * Translate traditional unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() - */ -int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) -{ - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); -} - -/* - * Access function for secpolicy_vnode_setattr - */ -int -zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) -{ - int v4_mode = zfs_unix_to_v4(mode >> 6); - - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); -} - -static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t missing_perms, cred_t *cr) -{ - int error; - uid_t downer; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); - - error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); - - if (error == 0) - error = zfs_sticky_remove_access(dzp, zp, cr); - - return (error); -} - -/* - * Determine whether Access should be granted/deny, without - * consulting least priv subsystem. - * - * - * The following chart is the recommended NFSv4 enforcement for - * ability to delete an object. 
- * - * ------------------------------------------------------- - * | Parent Dir | Target Object Permissions | - * | permissions | | - * ------------------------------------------------------- - * | | ACL Allows | ACL Denies| Delete | - * | | Delete | Delete | unspecified| - * ------------------------------------------------------- - * | ACL Allows | Permit | Permit | Permit | - * | DELETE_CHILD | | - * ------------------------------------------------------- - * | ACL Denies | Permit | Deny | Deny | - * | DELETE_CHILD | | | | - * ------------------------------------------------------- - * | ACL specifies | | | | - * | only allow | Permit | Permit | Permit | - * | write and | | | | - * | execute | | | | - * ------------------------------------------------------- - * | ACL denies | | | | - * | write and | Permit | Deny | Deny | - * | execute | | | | - * ------------------------------------------------------- - * ^ - * | - * No search privilege, can't even look up file? - * - */ -int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) -{ - uint32_t dzp_working_mode = 0; - uint32_t zp_working_mode = 0; - int dzp_error, zp_error; - mode_t missing_perms; - boolean_t dzpcheck_privs = B_TRUE; - boolean_t zpcheck_privs = B_TRUE; - - /* - * We want specific DELETE permissions to - * take precedence over WRITE/EXECUTE. We don't - * want an ACL such as this to mess us up. - * user:joe:write_data:deny,user:joe:delete:allow - * - * However, deny permissions may ultimately be overridden - * by secpolicy_vnode_access(). - * - * We will ask for all of the necessary permissions and then - * look at the working modes from the directory and target object - * to determine what was found. - */ - - if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) - return (EPERM); - - /* - * If the directory permissions allow the delete, we are done. 
- */ - if ((dzp_error = zfs_zaccess_common(dzp, - ACE_DELETE_CHILD|ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - /* - * If target object has delete permission then we are done - */ - if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - if (!dzpcheck_privs) - return (dzp_error); - else if (!zpcheck_privs) - return (zp_error); - - /* - * First check the first row. - * We only need to see if parent Allows delete_child - */ - if ((dzp_working_mode & ACE_DELETE_CHILD) == 0) - return (0); - - /* - * Second row - * we already have the necessary information in - * zp_working_mode, zp_error and dzp_error. - */ - - if ((zp_working_mode & ACE_DELETE) == 0) - return (0); - - /* - * determine the needed permissions based off of the directories - * working mode - */ - - missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; - missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; - - if (dzp_error == EACCES) - return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); - - /* - * Third Row - * only need to see if we have write/execute on directory. - */ - - if (missing_perms == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); - - /* - * Fourth Row - */ - - if (missing_perms && ((zp_working_mode & ACE_DELETE) == 0)) - return (zfs_sticky_remove_access(dzp, zp, cr)); - - return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); -} - -int -zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) -{ - int add_perm; - int error; - - if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) - return (EACCES); - - add_perm = (ZTOV(szp)->v_type == VDIR) ? - ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; - - /* - * Rename permissions are combination of delete permission + - * add file/subdir permission. - */ - - /* - * first make sure we do the delete portion. 
- * - * If that succeeds then check for add_file/add_subdir permissions - */ - - if (error = zfs_zaccess_delete(sdzp, szp, cr)) - return (error); - - /* - * If we have a tzp, see if we can delete it? - */ - if (tzp) { - if (error = zfs_zaccess_delete(tdzp, tzp, cr)) - return (error); - } - - /* - * Now check for add permissions - */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); - - return (error); -} diff --git a/zfs/lib/libdmu-ctl/zfs_ctldir.c b/zfs/lib/libdmu-ctl/zfs_ctldir.c deleted file mode 100644 index 45de481c9..000000000 --- a/zfs/lib/libdmu-ctl/zfs_ctldir.c +++ /dev/null @@ -1,1147 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_ctldir.c 1.20 08/04/27 SMI" - -/* - * ZFS control directory (a.k.a. ".zfs") - * - * This directory provides a common location for all ZFS meta-objects. - * Currently, this is only the 'snapshot' directory, but this may expand in the - * future. The elements are built using the GFS primitives, as the hierarchy - * does not actually exist on disk. 
- * - * For 'snapshot', we don't want to have all snapshots always mounted, because - * this would take up a huge amount of space in /etc/mnttab. We have three - * types of objects: - * - * ctldir ------> snapshotdir -------> snapshot - * | - * | - * V - * mounted fs - * - * The 'snapshot' node contains just enough information to lookup '..' and act - * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we - * perform an automount of the underlying filesystem and return the - * corresponding vnode. - * - * All mounts are handled automatically by the kernel, but unmounts are - * (currently) handled from user land. The main reason is that there is no - * reliable way to auto-unmount the filesystem when it's "no longer in use". - * When the user unmounts a filesystem, we call zfsctl_unmount(), which - * unmounts any snapshots within the snapshot directory. - * - * The '.zfs', '.zfs/snapshot', and all directories created under - * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and - * share the same vfs_t as the head filesystem (what '.zfs' lives under). - * - * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>' - * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. - * However, vnodes within these mounted on file systems have their v_vfsp - * fields set to the head filesystem to make NFS happy (see - * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t - * so that it cannot be freed until all snapshots have been unmounted. 
- */ - -#include <fs/fs_subr.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_vfsops.h> -#include <sys/vfs_opreg.h> -#include <sys/gfs.h> -#include <sys/stat.h> -#include <sys/dmu.h> -#include <sys/dsl_deleg.h> -#include <sys/mount.h> -#include <sys/sunddi.h> - -typedef struct zfsctl_node { - gfs_dir_t zc_gfs_private; - uint64_t zc_id; - timestruc_t zc_cmtime; /* ctime and mtime, always the same */ -} zfsctl_node_t; - -typedef struct zfsctl_snapdir { - zfsctl_node_t sd_node; - kmutex_t sd_lock; - avl_tree_t sd_snaps; -} zfsctl_snapdir_t; - -typedef struct { - char *se_name; - vnode_t *se_root; - avl_node_t se_node; -} zfs_snapentry_t; - -static int -snapentry_compare(const void *a, const void *b) -{ - const zfs_snapentry_t *sa = a; - const zfs_snapentry_t *sb = b; - int ret = strcmp(sa->se_name, sb->se_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -vnodeops_t *zfsctl_ops_root; -vnodeops_t *zfsctl_ops_snapdir; -vnodeops_t *zfsctl_ops_snapshot; - -static const fs_operation_def_t zfsctl_tops_root[]; -static const fs_operation_def_t zfsctl_tops_snapdir[]; -static const fs_operation_def_t zfsctl_tops_snapshot[]; - -static vnode_t *zfsctl_mknode_snapdir(vnode_t *); -static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); -static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); - -static gfs_opsvec_t zfsctl_opsvec[] = { - { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, - { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, - { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, - { NULL } -}; - -/* - * Root directory elements. We have only a single static entry, 'snapshot'. - */ -static gfs_dirent_t zfsctl_root_entries[] = { - { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, - { NULL } -}; - -/* include . and .. 
in the calculation */ -#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ - sizeof (gfs_dirent_t)) + 1) - - -/* - * Initialize the various GFS pieces we'll need to create and manipulate .zfs - * directories. This is called from the ZFS init routine, and initializes the - * vnode ops vectors that we'll be using. - */ -void -zfsctl_init(void) -{ - VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); -} - -void -zfsctl_fini(void) -{ - /* - * Remove vfsctl vnode ops - */ - if (zfsctl_ops_root) - vn_freevnodeops(zfsctl_ops_root); - if (zfsctl_ops_snapdir) - vn_freevnodeops(zfsctl_ops_snapdir); - if (zfsctl_ops_snapshot) - vn_freevnodeops(zfsctl_ops_snapshot); - - zfsctl_ops_root = NULL; - zfsctl_ops_snapdir = NULL; - zfsctl_ops_snapshot = NULL; -} - -/* - * Return the inode number associated with the 'snapshot' directory. - */ -/* ARGSUSED */ -static ino64_t -zfsctl_root_inode_cb(vnode_t *vp, int index) -{ - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); -} - -/* - * Create the '.zfs' directory. This directory is cached as part of the VFS - * structure. This results in a hold on the vfs_t. The code in zfs_umount() - * therefore checks against a vfs_count of 2 instead of 1. This reference - * is removed when the ctldir is destroyed in the unmount. - */ -void -zfsctl_create(zfsvfs_t *zfsvfs) -{ - vnode_t *vp, *rvp; - zfsctl_node_t *zcp; - - ASSERT(zfsvfs->z_ctldir == NULL); - - vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, - zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, - zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = ZFSCTL_INO_ROOT; - - VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0); - ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); - VN_RELE(rvp); - - /* - * We're only faking the fact that we have a root of a filesystem for - * the sake of the GFS interfaces. Undo the flag manipulation it did - * for us. 
- */ - vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT); - - zfsvfs->z_ctldir = vp; -} - -/* - * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. - * There might still be more references if we were force unmounted, but only - * new zfs_inactive() calls can occur and they don't reference .zfs - */ -void -zfsctl_destroy(zfsvfs_t *zfsvfs) -{ - VN_RELE(zfsvfs->z_ctldir); - zfsvfs->z_ctldir = NULL; -} - -/* - * Given a root znode, retrieve the associated .zfs directory. - * Add a hold to the vnode and return it. - */ -vnode_t * -zfsctl_root(znode_t *zp) -{ - ASSERT(zfs_has_ctldir(zp)); - VN_HOLD(zp->z_zfsvfs->z_ctldir); - return (zp->z_zfsvfs->z_ctldir); -} - -/* - * Common open routine. Disallow any write access. - */ -/* ARGSUSED */ -static int -zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct) -{ - if (flags & FWRITE) - return (EACCES); - - return (0); -} - -/* - * Common close routine. Nothing to do here. - */ -/* ARGSUSED */ -static int -zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off, - cred_t *cr, caller_context_t *ct) -{ - return (0); -} - -/* - * Common access routine. Disallow writes. - */ -/* ARGSUSED */ -static int -zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, - caller_context_t *ct) -{ - if (mode & VWRITE) - return (EACCES); - - return (0); -} - -/* - * Common getattr function. Fill in basic information. - */ -static void -zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) -{ - zfsctl_node_t *zcp = vp->v_data; - timestruc_t now; - - vap->va_uid = 0; - vap->va_gid = 0; - vap->va_rdev = 0; - /* - * We are a purly virtual object, so we have no - * blocksize or allocated blocks. - */ - vap->va_blksize = 0; - vap->va_nblocks = 0; - vap->va_seq = 0; - vap->va_fsid = vp->v_vfsp->vfs_dev; - vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - vap->va_type = VDIR; - /* - * We live in the now (for atime). 
- */ - gethrestime(&now); - vap->va_atime = now; - vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; -} - -/*ARGSUSED*/ -static int -zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_node_t *zcp = vp->v_data; - uint64_t object = zcp->zc_id; - zfid_short_t *zfid; - int i; - - ZFS_ENTER(zfsvfs); - - if (fidp->fid_len < SHORT_FID_LEN) { - fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); - return (ENOSPC); - } - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = SHORT_FID_LEN; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* .zfs znodes always have a generation number of 0 */ - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = 0; - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * .zfs inode namespace - * - * We need to generate unique inode numbers for all files and directories - * within the .zfs pseudo-filesystem. We use the following scheme: - * - * ENTRY ZFSCTL_INODE - * .zfs 1 - * .zfs/snapshot 2 - * .zfs/snapshot/<snap> objectid(snap) - */ - -#define ZFSCTL_INO_SNAP(id) (id) - -/* - * Get root directory attributes. - */ -/* ARGSUSED */ -static int -zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - - ZFS_ENTER(zfsvfs); - vap->va_nodeid = ZFSCTL_INO_ROOT; - vap->va_nlink = vap->va_size = NROOT_ENTRIES; - - zfsctl_common_getattr(vp, vap); - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* - * Special case the handling of "..". 
- */ -/* ARGSUSED */ -int -zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int err; - - /* - * No extended attributes allowed under .zfs - */ - if (flags & LOOKUP_XATTR) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - - if (strcmp(nm, "..") == 0) { - err = VFS_ROOT(dvp->v_vfsp, vpp); - } else { - err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, - cr, ct, direntflags, realpnp); - } - - ZFS_EXIT(zfsvfs); - - return (err); -} - -static const fs_operation_def_t zfsctl_tops_root[] = { - { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, - { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, - { VOPNAME_IOCTL, { .error = fs_inval } }, - { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, - { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, - { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, - { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, - { VOPNAME_SEEK, { .vop_seek = fs_seek } }, - { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, - { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, - { NULL } -}; - -static int -zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - - dmu_objset_name(os, zname); - if (strlen(zname) + 1 + strlen(name) >= len) - return (ENAMETOOLONG); - (void) strcat(zname, "@"); - (void) strcat(zname, name); - return (0); -} - -static int -zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) -{ - vnode_t *svp = sep->se_root; - int error; - - ASSERT(vn_ismntpt(svp)); - - /* this will be dropped by dounmount() */ - if ((error = vn_vfswlock(svp)) != 0) - return (error); - - VN_HOLD(svp); - error = dounmount(vn_mountedvfs(svp), fflags, cr); - if (error) { - VN_RELE(svp); - return (error); - } - VFS_RELE(svp->v_vfsp); - /* - * We 
can't use VN_RELE(), as that will try to invoke - * zfsctl_snapdir_inactive(), which would cause us to destroy - * the sd_lock mutex held by our caller. - */ - ASSERT(svp->v_count == 1); - gfs_vop_inactive(svp, cr, NULL); - - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - - return (0); -} - -static void -zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) -{ - avl_index_t where; - vfs_t *vfsp; - refstr_t *pathref; - char newpath[MAXNAMELEN]; - char *tail; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - ASSERT(sep != NULL); - - vfsp = vn_mountedvfs(sep->se_root); - ASSERT(vfsp != NULL); - - vfs_lock_wait(vfsp); - - /* - * Change the name in the AVL tree. - */ - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); - avl_insert(&sdp->sd_snaps, sep, where); - - /* - * Change the current mountpoint info: - * - update the tail of the mntpoint path - * - update the tail of the resource path - */ - pathref = vfs_getmntpoint(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '/')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setmntpoint(vfsp, newpath); - - pathref = vfs_getresource(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '@')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setresource(vfsp, newpath); - - vfs_unlock(vfsp); -} - -/*ARGSUSED*/ -static int -zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, - cred_t *cr, caller_context_t *ct, int flags) -{ - zfsctl_snapdir_t 
*sdp = sdvp->v_data; - zfs_snapentry_t search, *sep; - zfsvfs_t *zfsvfs; - avl_index_t where; - char from[MAXNAMELEN], to[MAXNAMELEN]; - char real[MAXNAMELEN]; - int err; - - zfsvfs = sdvp->v_vfsp->vfs_data; - ZFS_ENTER(zfsvfs); - - if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, - MAXNAMELEN, NULL); - if (err == 0) { - snm = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - ZFS_EXIT(zfsvfs); - - err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (!err) - err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (!err) - err = zfs_secpolicy_rename_perms(from, to, cr); - if (err) - return (err); - - /* - * Cannot move snapshots out of the snapdir. - */ - if (sdvp != tdvp) - return (EINVAL); - - if (strcmp(snm, tnm) == 0) - return (0); - - mutex_enter(&sdp->sd_lock); - - search.se_name = (char *)snm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { - mutex_exit(&sdp->sd_lock); - return (ENOENT); - } - - err = dmu_objset_rename(from, to, B_FALSE); - if (err == 0) - zfsctl_rename_snap(sdp, sep, tnm); - - mutex_exit(&sdp->sd_lock); - - return (err); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) -{ - zfsctl_snapdir_t *sdp = dvp->v_data; - zfs_snapentry_t *sep; - zfs_snapentry_t search; - zfsvfs_t *zfsvfs; - char snapname[MAXNAMELEN]; - char real[MAXNAMELEN]; - int err; - - zfsvfs = dvp->v_vfsp->vfs_data; - ZFS_ENTER(zfsvfs); - - if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - - err = dmu_snapshot_realname(zfsvfs->z_os, name, real, - MAXNAMELEN, NULL); - if (err == 0) { - name = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - ZFS_EXIT(zfsvfs); - - err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (!err) - err = zfs_secpolicy_destroy_perms(snapname, cr); - 
if (err) - return (err); - - mutex_enter(&sdp->sd_lock); - - search.se_name = name; - sep = avl_find(&sdp->sd_snaps, &search, NULL); - if (sep) { - avl_remove(&sdp->sd_snaps, sep); - err = zfsctl_unmount_snap(sep, MS_FORCE, cr); - if (err) - avl_add(&sdp->sd_snaps, sep); - else - err = dmu_objset_destroy(snapname); - } else { - err = ENOENT; - } - - mutex_exit(&sdp->sd_lock); - - return (err); -} - -/* - * This creates a snapshot under '.zfs/snapshot'. - */ -/* ARGSUSED */ -static int -zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, - cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - char name[MAXNAMELEN]; - int err; - static enum symfollow follow = NO_FOLLOW; - static enum uio_seg seg = UIO_SYSSPACE; - - dmu_objset_name(zfsvfs->z_os, name); - - *vpp = NULL; - - err = zfs_secpolicy_snapshot_perms(name, cr); - if (err) - return (err); - - if (err == 0) { - err = dmu_objset_snapshot(name, dirname, B_FALSE); - if (err) - return (err); - err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); - } - - return (err); -} - -/* - * Lookup entry point for the 'snapshot' directory. Try to open the - * snapshot if it exist, creating the pseudo filesystem vnode as necessary. - * Perform a mount of the associated dataset on top of the vnode. 
- */ -/* ARGSUSED */ -static int -zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsctl_snapdir_t *sdp = dvp->v_data; - objset_t *snap; - char snapname[MAXNAMELEN]; - char real[MAXNAMELEN]; - char *mountpoint; - zfs_snapentry_t *sep, search; - struct mounta margs; - vfs_t *vfsp; - size_t mountpoint_len; - avl_index_t where; - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int err; - - /* - * No extended attributes allowed under .zfs - */ - if (flags & LOOKUP_XATTR) - return (EINVAL); - - ASSERT(dvp->v_type == VDIR); - - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - - /* - * If we get a recursive call, that means we got called - * from the domount() code while it was trying to look up the - * spec (which looks like a local path for zfs). We need to - * add some flag to domount() to tell it not to do this lookup. - */ - if (MUTEX_HELD(&sdp->sd_lock)) - return (ENOENT); - - ZFS_ENTER(zfsvfs); - - if (flags & FIGNORECASE) { - boolean_t conflict = B_FALSE; - - err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, - MAXNAMELEN, &conflict); - if (err == 0) { - nm = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); - return (err); - } - if (realpnp) - (void) strlcpy(realpnp->pn_buf, nm, - realpnp->pn_bufsize); - if (conflict && direntflags) - *direntflags = ED_CASE_CONFLICT; - } - - mutex_enter(&sdp->sd_lock); - search.se_name = (char *)nm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { - *vpp = sep->se_root; - VN_HOLD(*vpp); - err = traverse(vpp); - if (err) { - VN_RELE(*vpp); - *vpp = NULL; - } else if (*vpp == sep->se_root) { - /* - * The snapshot was unmounted behind our backs, - * try to remount it. - */ - goto domount; - } else { - /* - * VROOT was set during the traverse call. We need - * to clear it since we're pretending to be part - * of our parent's vfs. 
- */ - (*vpp)->v_flag &= ~VROOT; - } - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (err); - } - - /* - * The requested snapshot is not currently mounted, look it up. - */ - err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); - if (err) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (err); - } - if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (ENOENT); - } - - sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); - avl_insert(&sdp->sd_snaps, sep, where); - - dmu_objset_close(snap); -domount: - mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + - strlen("/.zfs/snapshot/") + strlen(nm) + 1; - mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); - (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", - refstr_value(dvp->v_vfsp->vfs_mntpt), nm); - - margs.spec = snapname; - margs.dir = mountpoint; - margs.flags = MS_SYSSPACE | MS_NOMNTTAB; - margs.fstype = "zfs"; - margs.dataptr = NULL; - margs.datalen = 0; - margs.optptr = NULL; - margs.optlen = 0; - - err = domount("zfs", &margs, *vpp, kcred, &vfsp); - kmem_free(mountpoint, mountpoint_len); - - if (err == 0) { - /* - * Return the mounted root rather than the covered mount point. - * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns - * the ZFS vnode mounted on top of the GFS node. This ZFS - * vnode is the root the newly created vfsp. - */ - VFS_RELE(vfsp); - err = traverse(vpp); - } - - if (err == 0) { - /* - * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. - * - * This is where we lie about our v_vfsp in order to - * make .zfs/snapshot/<snapname> accessible over NFS - * without requiring manual mounts of <snapname>. 
- */ - ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); - VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; - (*vpp)->v_vfsp = zfsvfs->z_vfs; - (*vpp)->v_flag &= ~VROOT; - } - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - - /* - * If we had an error, drop our hold on the vnode and - * zfsctl_snapshot_inactive() will clean up. - */ - if (err) { - VN_RELE(*vpp); - *vpp = NULL; - } - return (err); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, - offset_t *offp, offset_t *nextp, void *data, int flags) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - char snapname[MAXNAMELEN]; - uint64_t id, cookie; - boolean_t case_conflict; - int error; - - ZFS_ENTER(zfsvfs); - - cookie = *offp; - error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, - &cookie, &case_conflict); - if (error) { - ZFS_EXIT(zfsvfs); - if (error == ENOENT) { - *eofp = 1; - return (0); - } - return (error); - } - - if (flags & V_RDDIR_ENTFLAGS) { - edirent_t *eodp = dp; - - (void) strcpy(eodp->ed_name, snapname); - eodp->ed_ino = ZFSCTL_INO_SNAP(id); - eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0; - } else { - struct dirent64 *odp = dp; - - (void) strcpy(odp->d_name, snapname); - odp->d_ino = ZFSCTL_INO_SNAP(id); - } - *nextp = cookie; - - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* - * pvp is the '.zfs' directory (zfsctl_node_t). - * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). - * - * This function is the callback to create a GFS vnode for '.zfs/snapshot' - * when a lookup is performed on .zfs for "snapshot". 
- */ -vnode_t * -zfsctl_mknode_snapdir(vnode_t *pvp) -{ - vnode_t *vp; - zfsctl_snapdir_t *sdp; - - vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, - zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, - zfsctl_snapdir_readdir_cb, NULL); - sdp = vp->v_data; - sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; - sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; - mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&sdp->sd_snaps, snapentry_compare, - sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); - return (vp); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_snapdir_t *sdp = vp->v_data; - - ZFS_ENTER(zfsvfs); - zfsctl_common_getattr(vp, vap); - vap->va_nodeid = gfs_file_inode(vp); - vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* ARGSUSED */ -static void -zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - zfsctl_snapdir_t *sdp = vp->v_data; - void *private; - - private = gfs_dir_inactive(vp); - if (private != NULL) { - ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); - mutex_destroy(&sdp->sd_lock); - avl_destroy(&sdp->sd_snaps); - kmem_free(private, sizeof (zfsctl_snapdir_t)); - } -} - -static const fs_operation_def_t zfsctl_tops_snapdir[] = { - { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, - { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, - { VOPNAME_IOCTL, { .error = fs_inval } }, - { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, - { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, - { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, - { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, - { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, - { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, - { VOPNAME_LOOKUP, { .vop_lookup = 
zfsctl_snapdir_lookup } }, - { VOPNAME_SEEK, { .vop_seek = fs_seek } }, - { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, - { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, - { NULL } -}; - -/* - * pvp is the GFS vnode '.zfs/snapshot'. - * - * This creates a GFS node under '.zfs/snapshot' representing each - * snapshot. This newly created GFS node is what we mount snapshot - * vfs_t's ontop of. - */ -static vnode_t * -zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) -{ - vnode_t *vp; - zfsctl_node_t *zcp; - - vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, - zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = objset; - VFS_HOLD(vp->v_vfsp); - - return (vp); -} - -static void -zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - zfsctl_snapdir_t *sdp; - zfs_snapentry_t *sep, *next; - vnode_t *dvp; - - VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - - if (vp->v_count > 1) { - mutex_exit(&sdp->sd_lock); - return; - } - ASSERT(!vn_ismntpt(vp)); - - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - next = AVL_NEXT(&sdp->sd_snaps, sep); - - if (sep->se_root == vp) { - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - break; - } - sep = next; - } - ASSERT(sep != NULL); - - mutex_exit(&sdp->sd_lock); - VN_RELE(dvp); - VFS_RELE(vp->v_vfsp); - - /* - * Dispose of the vnode for the snapshot mount point. - * This is safe to do because once this entry has been removed - * from the AVL tree, it can't be found again, so cannot become - * "active". If we lookup the same name again we will end up - * creating a new vnode. - */ - gfs_vop_inactive(vp, cr, ct); -} - - -/* - * These VP's should never see the light of day. They should always - * be covered. 
- */ -static const fs_operation_def_t zfsctl_tops_snapshot[] = { - VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive }, - NULL, NULL -}; - -int -zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *vp; - zfsctl_snapdir_t *sdp; - zfsctl_node_t *zcp; - zfs_snapentry_t *sep; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, kcred, NULL, NULL, NULL); - if (error != 0) - return (error); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - vp = sep->se_root; - zcp = vp->v_data; - if (zcp->zc_id == objsetid) - break; - - sep = AVL_NEXT(&sdp->sd_snaps, sep); - } - - if (sep != NULL) { - VN_HOLD(vp); - /* - * Return the mounted root rather than the covered mount point. - * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid> - * and returns the ZFS vnode mounted on top of the GFS node. - * This ZFS vnode is the root of the vfs for objset 'objsetid'. - */ - error = traverse(&vp); - if (error == 0) { - if (vp == sep->se_root) - error = EINVAL; - else - *zfsvfsp = VTOZ(vp)->z_zfsvfs; - } - mutex_exit(&sdp->sd_lock); - VN_RELE(vp); - } else { - error = EINVAL; - mutex_exit(&sdp->sd_lock); - } - - VN_RELE(dvp); - - return (error); -} - -/* - * Unmount any snapshots for the given filesystem. This is called from - * zfs_umount() - if we have a ctldir, then go through and unmount all the - * snapshots. 
 */
int
zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	/* Find the '.zfs/snapshot' GFS directory so we can walk its entries. */
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, cr, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	/*
	 * Walk the snapshot AVL tree.  'next' is captured before any
	 * removal so iteration is safe across avl_remove() of 'sep'.
	 */
	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		/*
		 * If this snapshot is not mounted, then it must
		 * have just been unmounted by somebody else, and
		 * will be cleaned up by zfsctl_snapdir_inactive().
		 */
		if (vn_ismntpt(sep->se_root)) {
			avl_remove(&sdp->sd_snaps, sep);
			error = zfsctl_unmount_snap(sep, fflags, cr);
			if (error) {
				/*
				 * Unmount failed (e.g. snapshot busy):
				 * re-insert the entry and report the error.
				 */
				avl_add(&sdp->sd_snaps, sep);
				break;
			}
		}
		sep = next;
	}

	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	return (error);
}
diff --git a/zfs/lib/libdmu-ctl/zfs_dir.c b/zfs/lib/libdmu-ctl/zfs_dir.c
deleted file mode 100644
index 6f22e2ad1..000000000
--- a/zfs/lib/libdmu-ctl/zfs_dir.c
+++ /dev/null
@@ -1,968 +0,0 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_dir.c 1.25 08/04/27 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/mode.h> -#include <sys/kmem.h> -#include <sys/uio.h> -#include <sys/pathname.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/stat.h> -#include <sys/unistd.h> -#include <sys/sunddi.h> -#include <sys/random.h> -#include <sys/policy.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/fs/zfs.h> -#include "fs/fs_subr.h" -#include <sys/zap.h> -#include <sys/dmu.h> -#include <sys/atomic.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/dnlc.h> -#include <sys/extdirent.h> - -/* - * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups - * of names after deciding which is the appropriate lookup interface. - */ -static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) -{ - int error; - - if (zfsvfs->z_norm) { - matchtype_t mt = MT_FIRST; - boolean_t conflict = B_FALSE; - size_t bufsz = 0; - char *buf = NULL; - - if (rpnp) { - buf = rpnp->pn_buf; - bufsz = rpnp->pn_bufsize; - } - if (exact) - mt = MT_EXACT; - /* - * In the non-mixed case we only expect there would ever - * be one match, but we need to use the normalizing lookup. - */ - error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, buf, bufsz, &conflict); - if (!error && deflags) - *deflags = conflict ? 
ED_CASE_CONFLICT : 0; - } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); - } - *zoid = ZFS_DIRENT_OBJ(*zoid); - - if (error == ENOENT && update) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); - - return (error); -} - -/* - * Lock a directory entry. A dirlock on <dzp, name> protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. - * - * Input arguments: - * dzp - znode for directory - * name - name of entry to lock - * flag - ZNEW: if the entry already exists, fail with EEXIST. - * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. - * ZXATTR: we want dzp's xattr directory - * ZCILOOK: On a mixed sensitivity file system, - * this lookup should be case-insensitive. - * ZCIEXACT: On a purely case-insensitive file system, - * this lookup should be case-sensitive. - * ZRENAMING: we are locking for renaming, force narrow locks - * - * Output arguments: - * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * direntflags - (case-insensitive lookup only) - * flags if multiple case-sensitive matches exist in directory - * realpnp - (case-insensitive lookup only) - * actual name matched within the directory - * - * Return value: 0 on success or errno on failure. - * - * NOTE: Always checks for, and rejects, '.' and '..'. - * NOTE: For case-insensitive file systems we take wide locks (see below), - * but return znode pointers to a single match. 
 */
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
    int flag, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zfs_dirlock_t *dl;
	boolean_t update;
	boolean_t exact;
	uint64_t zoid;
	vnode_t *vp = NULL;
	int error = 0;
	int cmpflags;

	*zpp = NULL;
	*dlpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 * (note: relies on && binding tighter than ||).
	 */
	if (name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
		return (EEXIST);

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect what vnodes can be cached in the DNLC, how we
	 * perform zap lookups, and the "width" of our dirlocks.
	 *
	 * A normal dirlock locks a single name.  Note that with
	 * normalization a name can be composed multiple ways, but
	 * when normalized, these names all compare equal.  A wide
	 * dirlock locks multiple names.  We need these when the file
	 * system is supporting mixed-mode access.  It is sometimes
	 * necessary to lock all case permutations of file name at
	 * once so that simultaneous case-insensitive/case-sensitive
	 * behaves as rationally as possible.
	 */

	/*
	 * Decide if exact matches should be requested when performing
	 * a zap lookup on file systems supporting case-insensitive
	 * access.
	 */
	exact =
	    ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
	    ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
	 * case for performance improvement?
	 */
	update = !zfsvfs->z_norm ||
	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));

	/*
	 * ZRENAMING indicates we are in a situation where we should
	 * take narrow locks regardless of the file system's
	 * preferences for normalizing and case folding.  This will
	 * prevent us deadlocking trying to grab the same wide lock
	 * twice if the two names happen to be case-insensitive
	 * matches.
	 */
	if (flag & ZRENAMING)
		cmpflags = 0;
	else
		cmpflags = zfsvfs->z_norm;

	/*
	 * Wait until there are no locks on this name.
	 */
	rw_enter(&dzp->z_name_lock, RW_READER);
	mutex_enter(&dzp->z_lock);
	for (;;) {
		/* Directory already marked for deletion: nothing to lock. */
		if (dzp->z_unlinked) {
			mutex_exit(&dzp->z_lock);
			rw_exit(&dzp->z_name_lock);
			return (ENOENT);
		}
		/* Scan the existing dirlock list for a matching name. */
		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
				break;
		}
		if (error != 0) {
			/* u8_strcmp() failed (bad multi-byte name). */
			mutex_exit(&dzp->z_lock);
			rw_exit(&dzp->z_name_lock);
			return (ENOENT);
		}
		if (dl == NULL) {
			/*
			 * Allocate a new dirlock and add it to the list.
			 */
			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
			dl->dl_name = name;
			dl->dl_sharecnt = 0;
			dl->dl_namesize = 0;
			dl->dl_dzp = dzp;
			dl->dl_next = dzp->z_dirlocks;
			dzp->z_dirlocks = dl;
			break;
		}
		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
			break;
		/* Name is locked by someone else; wait and retry. */
		cv_wait(&dl->dl_cv, &dzp->z_lock);
	}

	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
		/*
		 * We're the second shared reference to dl.  Make a copy of
		 * dl_name in case the first thread goes away before we do.
		 * Note that we initialize the new name before storing its
		 * pointer into dl_name, because the first thread may load
		 * dl->dl_name at any time.  He'll either see the old value,
		 * which is his, or the new shared copy; either is OK.
		 */
		dl->dl_namesize = strlen(dl->dl_name) + 1;
		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
		bcopy(dl->dl_name, name, dl->dl_namesize);
		dl->dl_name = name;
	}

	mutex_exit(&dzp->z_lock);

	/*
	 * We have a dirlock on the name.  (Note that it is the dirlock,
	 * not the dzp's z_lock, that protects the name in the zap object.)
	 * See if there's an object by this name; if so, put a hold on it.
	 */
	if (flag & ZXATTR) {
		/* Caller wants dzp's xattr directory, not a named entry. */
		zoid = dzp->z_phys->zp_xattr;
		error = (zoid == 0 ? ENOENT : 0);
	} else {
		if (update)
			vp = dnlc_lookup(ZTOV(dzp), name);
		if (vp == DNLC_NO_VNODE) {
			/* DNLC negative-cache hit. */
			VN_RELE(vp);
			error = ENOENT;
		} else if (vp) {
			if (flag & ZNEW) {
				zfs_dirent_unlock(dl);
				VN_RELE(vp);
				return (EEXIST);
			}
			*dlpp = dl;
			*zpp = VTOZ(vp);
			return (0);
		} else {
			/* DNLC miss: fall back to a zap lookup. */
			error = zfs_match_find(zfsvfs, dzp, name, exact,
			    update, direntflags, realpnp, &zoid);
		}
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			zfs_dirent_unlock(dl);
			return (EEXIST);
		}
		error = zfs_zget(zfsvfs, zoid, zpp);
		if (error) {
			zfs_dirent_unlock(dl);
			return (error);
		}
		if (!(flag & ZXATTR) && update)
			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
	}

	*dlpp = dl;

	return (0);
}

/*
 * Unlock this directory entry and wake anyone who was waiting for it.
 */
void
zfs_dirent_unlock(zfs_dirlock_t *dl)
{
	znode_t *dzp = dl->dl_dzp;
	zfs_dirlock_t **prev_dl, *cur_dl;

	mutex_enter(&dzp->z_lock);
	rw_exit(&dzp->z_name_lock);
	if (dl->dl_sharecnt > 1) {
		/* Other shared holders remain; just drop our reference. */
		dl->dl_sharecnt--;
		mutex_exit(&dzp->z_lock);
		return;
	}
	/* Last holder: unlink dl from the directory's dirlock list. */
	prev_dl = &dzp->z_dirlocks;
	while ((cur_dl = *prev_dl) != dl)
		prev_dl = &cur_dl->dl_next;
	*prev_dl = dl->dl_next;
	cv_broadcast(&dl->dl_cv);
	mutex_exit(&dzp->z_lock);

	/* dl_namesize != 0 means dl_name was a shared kmem copy we own. */
	if (dl->dl_namesize != 0)
		kmem_free(dl->dl_name, dl->dl_namesize);
	cv_destroy(&dl->dl_cv);
	kmem_free(dl, sizeof (*dl));
}

/*
 * Look up an entry in a directory.
 *
 * NOTE: '.' and '..' are handled as special cases because
 * no directory entries are actually stored for them.  If this is
 * the root of a filesystem, then '.zfs' is also treated as a
 * special pseudo-directory.
 */
int
zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
    int *deflg, pathname_t *rpnp)
{
	zfs_dirlock_t *dl;
	znode_t *zp;
	int error = 0;

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* "" or "." refers to the directory itself. */
		*vpp = ZTOV(dzp);
		VN_HOLD(*vpp);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (dzp->z_phys->zp_parent == dzp->z_id &&
		    zfsvfs->z_parent != zfsvfs) {
			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", vpp, NULL, 0, NULL, kcred,
			    NULL, NULL, NULL);
			return (error);
		}
		rw_enter(&dzp->z_parent_lock, RW_READER);
		error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);
		rw_exit(&dzp->z_parent_lock);
	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
		/* '.zfs' at the filesystem root: hand back the ctldir. */
		*vpp = zfsctl_root(dzp);
	} else {
		int zf;

		zf = ZEXISTS | ZSHARED;
		if (flags & FIGNORECASE)
			zf |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
		if (error == 0) {
			*vpp = ZTOV(zp);
			zfs_dirent_unlock(dl);
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
		}
		/* rpnp already filled in by zfs_dirent_lock() on this path. */
		rpnp = NULL;
	}

	if ((flags & FIGNORECASE) && rpnp && !error)
		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);

	return (error);
}

/*
 * Render x as lowercase hex into namebuf[0..16] (17 bytes incl. NUL);
 * returns a pointer to the first digit, not namebuf itself.
 */
static char *
zfs_unlinked_hexname(char namebuf[17], uint64_t x)
{
	char *name = &namebuf[16];
	const char digits[16] = "0123456789abcdef";

	*name = '\0';
	do {
		*--name = digits[x & 0xf];
		x >>= 4;
	} while (x != 0);

	return (name);
}

/*
 * unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the nlink list is using a fat zap (ie. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	char obj_name[17];
	int error;

	ASSERT(zp->z_unlinked);
	ASSERT3U(zp->z_phys->zp_links, ==, 0);

	/* Key the unlinked-set zap entry by the object id rendered in hex. */
	error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
	    zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
	ASSERT3U(error, ==, 0);
}

/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	zap_cursor_t zc;
	zap_attribute_t zap;
	dmu_object_info_t doi;
	znode_t *zp;
	int error;

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, &zap) == 0;
	    zap_cursor_advance(&zc)) {

		/*
		 * See what kind of object we have in list
		 */

		error = dmu_object_info(zfsvfs->z_os,
		    zap.za_first_integer, &doi);
		if (error != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 */
		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);

		/*
		 * We may pick up znodes that are already marked for deletion.
		 * This could happen during the purge of an extended attribute
		 * directory.  All we need to do is skip over them, since they
		 * are already in the system marked z_unlinked.
		 */
		if (error != 0)
			continue;

		zp->z_unlinked = B_TRUE;
		VN_RELE(ZTOV(zp));
	}
	zap_cursor_fini(&zc);
}

/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted.  If we encounter
 * an error, return a count of at least one so that the directory stays
 * in the unlinked set.
 *
 * NOTE: this function assumes that the directory is inactive,
 * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory contents is *only* regular - * files. - */ -static int -zfs_purgedir(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - znode_t *xzp; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t dl; - int skipped = 0; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - error = zfs_zget(zfsvfs, - ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - if (error) { - skipped += 1; - continue; - } - - ASSERT((ZTOV(xzp)->v_type == VREG) || - (ZTOV(xzp)->v_type == VLNK)); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - VN_RELE(ZTOV(xzp)); - skipped += 1; - continue; - } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); - if (error) - skipped += 1; - dmu_tx_commit(tx); - - VN_RELE(ZTOV(xzp)); - } - zap_cursor_fini(&zc); - if (error != ENOENT) - skipped += 1; - return (skipped); -} - -void -zfs_rmnode(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - znode_t *xzp = NULL; - char obj_name[17]; - dmu_tx_t *tx; - uint64_t acl_obj; - int error; - - ASSERT(ZTOV(zp)->v_count == 0); - ASSERT(zp->z_phys->zp_links == 0); - - /* - * If this is an attribute directory, purge its contents. - */ - if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { - if (zfs_purgedir(zp) != 0) { - /* - * Not enough space to delete some xattrs. - * Leave it on the unlinked set. - */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - } - - /* - * If the file has extended attributes, we're going to unlink - * the xattr dir. 
- */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT(error == 0); - } - - acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; - - /* - * Set up the transaction. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - if (xzp) { - dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); - } - if (acl_obj) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - /* - * Not enough space to delete the file. Leave it in the - * unlinked set, leaking it until the fs is remounted (at - * which point we'll call zfs_unlinked_drain() to process it). - */ - dmu_tx_abort(tx); - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - goto out; - } - - if (xzp) { - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_phys->zp_links = 0; /* no more links to it */ - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - } - - /* Remove this znode from the unlinked set */ - error = zap_remove(os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), tx); - ASSERT3U(error, ==, 0); - - zfs_znode_delete(zp, tx); - - dmu_tx_commit(tx); -out: - if (xzp) - VN_RELE(ZTOV(xzp)); -} - -static uint64_t -zfs_dirent(znode_t *zp) -{ - uint64_t de = zp->z_id; - if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT((zp)->z_phys->zp_mode) << 60; - return (de); -} - -/* - * Link zp into dl. Can only fail if zp has been unlinked. 
 */
int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
	znode_t *dzp = dl->dl_dzp;
	vnode_t *vp = ZTOV(zp);
	uint64_t value;
	int zp_is_dir = (vp->v_type == VDIR);
	int error;

	dmu_buf_will_dirty(zp->z_dbuf, tx);
	mutex_enter(&zp->z_lock);

	if (!(flag & ZRENAMING)) {
		if (zp->z_unlinked) { /* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			mutex_exit(&zp->z_lock);
			return (ENOENT);
		}
		zp->z_phys->zp_links++;
	}
	zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */

	/* A brand-new znode's ctime was already set at creation. */
	if (!(flag & ZNEW))
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	mutex_exit(&zp->z_lock);

	dmu_buf_will_dirty(dzp->z_dbuf, tx);
	mutex_enter(&dzp->z_lock);
	dzp->z_phys->zp_size++; /* one dirent added */
	dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
	mutex_exit(&dzp->z_lock);

	/* Store the dirent (object id + type bits) under dl_name. */
	value = zfs_dirent(zp);
	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
	    8, 1, &value, tx);
	ASSERT(error == 0);

	dnlc_update(ZTOV(dzp), dl->dl_name, vp);

	return (0);
}

/*
 * Unlink zp from dl, and mark zp for deletion if this was the last link.
 * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 * and it's the caller's job to do it.
- */ -int -zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) -{ - znode_t *dzp = dl->dl_dzp; - vnode_t *vp = ZTOV(zp); - int zp_is_dir = (vp->v_type == VDIR); - boolean_t unlinked = B_FALSE; - int error; - - dnlc_remove(ZTOV(dzp), dl->dl_name); - - if (!(flag & ZRENAMING)) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ - return (EBUSY); - - if (vn_ismntpt(vp)) { /* don't remove mount point */ - vn_vfsunlock(vp); - return (EBUSY); - } - - mutex_enter(&zp->z_lock); - if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - return (EEXIST); - } - if (zp->z_phys->zp_links <= zp_is_dir) { - zfs_panic_recover("zfs: link count on %s is %u, " - "should be at least %u", - zp->z_vnode->v_path ? zp->z_vnode->v_path : - "<unknown>", (int)zp->z_phys->zp_links, - zp_is_dir + 1); - zp->z_phys->zp_links = zp_is_dir + 1; - } - if (--zp->z_phys->zp_links == zp_is_dir) { - zp->z_unlinked = B_TRUE; - zp->z_phys->zp_links = 0; - unlinked = B_TRUE; - } else { - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - } - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - } - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size--; /* one dirent removed */ - dzp->z_phys->zp_links -= zp_is_dir; /* ".." 
link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); - - if (zp->z_zfsvfs->z_norm) { - if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && - (flag & ZCIEXACT)) || - ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && - !(flag & ZCILOOK))) - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_EXACT, tx); - else - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_FIRST, tx); - } else { - error = zap_remove(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, tx); - } - ASSERT(error == 0); - - if (unlinkedp != NULL) - *unlinkedp = unlinked; - else if (unlinked) - zfs_unlinked_add(zp, tx); - - return (0); -} - -/* - * Indicate whether the directory is empty. Works with or without z_lock - * held, but can only be consider a hint in the latter case. Returns true - * if only "." and ".." remain and there's no work in progress. - */ -boolean_t -zfs_dirempty(znode_t *dzp) -{ - return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); -} - -int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - dmu_tx_t *tx; - int error; - zfs_fuid_info_t *fuidp = NULL; - - *xvpp = NULL; - - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) - return (error); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) - 
dmu_tx_wait(tx); - dmu_tx_abort(tx); - return (error); - } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); - ASSERT(xzp->z_phys->zp_parent == zp->z_id); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xzp->z_id; - - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - - *xvpp = ZTOV(xzp); - - return (0); -} - -/* - * Return a znode for the extended attribute directory for zp. - * ** If the directory does not already exist, it is created ** - * - * IN: zp - znode to obtain attribute directory from - * cr - credentials of caller - * flags - flags from the VOP_LOOKUP call - * - * OUT: xzpp - pointer to extended attribute znode - * - * RETURN: 0 on success - * error number on failure - */ -int -zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - zfs_dirlock_t *dl; - vattr_t va; - int error; -top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); - if (error) - return (error); - - if (xzp != NULL) { - *xvpp = ZTOV(xzp); - zfs_dirent_unlock(dl); - return (0); - } - - ASSERT(zp->z_phys->zp_xattr == 0); - - if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); - return (ENOENT); - } - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - zfs_dirent_unlock(dl); - return (EROFS); - } - - /* - * The ability to 'create' files in an attribute - * directory comes from the write_xattr permission on the base file. - * - * The ability to 'search' an attribute directory requires - * read_xattr permission on the base file. - * - * Once in a directory the ability to read/write attributes - * is controlled by the permissions on the attribute file. 
- */ - va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; - va.va_type = VDIR; - va.va_mode = S_IFDIR | S_ISVTX | 0777; - zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); - - error = zfs_make_xattrdir(zp, &va, xvpp, cr); - zfs_dirent_unlock(dl); - - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() if necessary */ - goto top; - } - - return (error); -} - -/* - * Decide whether it is okay to remove within a sticky directory. - * - * In sticky directories, write access is not sufficient; - * you can remove entries from a directory only if: - * - * you own the directory, - * you own the entry, - * the entry is a plain file and you have write access, - * or you are privileged (checked in secpolicy...). - * - * The function returns 0 if remove access is granted. - */ -int -zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) -{ - uid_t uid; - uid_t downer; - uid_t fowner; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - - if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ - return (0); - - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) - return (0); - - downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); - - if ((uid = crgetuid(cr)) == downer || uid == fowner || - (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) - return (0); - else - return (secpolicy_vnode_remove(cr)); -} diff --git a/zfs/lib/libdmu-ctl/zfs_fuid.c b/zfs/lib/libdmu-ctl/zfs_fuid.c deleted file mode 100644 index 59c9adfe2..000000000 --- a/zfs/lib/libdmu-ctl/zfs_fuid.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_fuid.c 1.5 08/01/31 SMI" - -#include <sys/zfs_context.h> -#include <sys/sunddi.h> -#include <sys/dmu.h> -#include <sys/avl.h> -#include <sys/zap.h> -#include <sys/refcount.h> -#include <sys/nvpair.h> -#ifdef _KERNEL -#include <sys/kidmap.h> -#include <sys/sid.h> -#include <sys/zfs_vfsops.h> -#include <sys/zfs_znode.h> -#endif -#include <sys/zfs_fuid.h> - -/* - * FUID Domain table(s). - * - * The FUID table is stored as a packed nvlist of an array - * of nvlists which contain an index, domain string and offset - * - * During file system initialization the nvlist(s) are read and - * two AVL trees are created. One tree is keyed by the index number - * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then the - * on-disk packed nvlist will also be updated. - */ - -#define FUID_IDX "fuid_idx" -#define FUID_DOMAIN "fuid_domain" -#define FUID_OFFSET "fuid_offset" -#define FUID_NVP_ARRAY "fuid_nvlist" - -typedef struct fuid_domain { - avl_node_t f_domnode; - avl_node_t f_idxnode; - ksiddomain_t *f_ksid; - uint64_t f_idx; -} fuid_domain_t; - -/* - * Compare two indexes. 
- */ -static int -idx_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; - - if (node1->f_idx < node2->f_idx) - return (-1); - else if (node1->f_idx > node2->f_idx) - return (1); - return (0); -} - -/* - * Compare two domain strings. - */ -static int -domain_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; - int val; - - val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - if (val == 0) - return (0); - return (val > 0 ? 1 : -1); -} - -/* - * load initial fuid domain and idx trees. This function is used by - * both the kernel and zdb. - */ -uint64_t -zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, - avl_tree_t *domain_tree) -{ - dmu_buf_t *db; - uint64_t fuid_size; - - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); - - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - if (fuid_size) { - nvlist_t **fuidnvp; - nvlist_t *nvp = NULL; - uint_t count; - char *packed; - int i; - - packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); - VERIFY(nvlist_unpack(packed, fuid_size, - &nvp, 0) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, - &fuidnvp, &count) == 0); - - for (i = 0; i != count; i++) { - fuid_domain_t *domnode; - char *domain; - uint64_t idx; - - VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, - &domain) == 0); - VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, - &idx) == 0); - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - - domnode->f_idx = idx; - domnode->f_ksid = ksid_lookupdomain(domain); - avl_add(idx_tree, domnode); - avl_add(domain_tree, domnode); - } - nvlist_free(nvp); - 
kmem_free(packed, fuid_size); - } - return (fuid_size); -} - -void -zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) -{ - fuid_domain_t *domnode; - void *cookie; - - cookie = NULL; - while (domnode = avl_destroy_nodes(domain_tree, &cookie)) - ksiddomain_rele(domnode->f_ksid); - - avl_destroy(domain_tree); - cookie = NULL; - while (domnode = avl_destroy_nodes(idx_tree, &cookie)) - kmem_free(domnode, sizeof (fuid_domain_t)); - avl_destroy(idx_tree); -} - -char * -zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - - searchnode.f_idx = idx; - - findnode = avl_find(idx_tree, &searchnode, &loc); - - return (findnode->f_ksid->kd_name); -} - -#ifdef _KERNEL -/* - * Load the fuid table(s) into memory. - */ -static void -zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - int error = 0; - - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - - if (zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - - if (zfsvfs->z_fuid_obj == 0) { - - /* first make sure we need to allocate object */ - - error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error == ENOENT && tx != NULL) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - } - - zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, - zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - - zfsvfs->z_fuid_loaded = B_TRUE; - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Query domain table for a given domain. - * - * If domain isn't found it is added to AVL trees and - * the results are pushed out to disk. 
- */ -int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, - dmu_tx_t *tx) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - - /* - * If the dummy "nobody" domain then return an index of 0 - * to cause the created FUID to be a standard POSIX id - * for the user nobody. - */ - if (domain[0] == '\0') { - *retdomain = ""; - return (0); - } - - searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) { - *retdomain = searchnode.f_ksid->kd_name; - } - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, tx); - - rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); - rw_exit(&zfsvfs->z_fuid_lock); - - if (findnode) { - ksiddomain_rele(searchnode.f_ksid); - return (findnode->f_idx); - } else { - fuid_domain_t *domnode; - nvlist_t *nvp; - nvlist_t **fuids; - uint64_t retidx; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - int i = 0; - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - domnode->f_ksid = searchnode.f_ksid; - - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; - - avl_add(&zfsvfs->z_fuid_domain, domnode); - avl_add(&zfsvfs->z_fuid_idx, domnode); - /* - * Now resync the on-disk nvlist. 
- */ - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - domnode = avl_first(&zfsvfs->z_fuid_domain); - fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); - while (domnode) { - VERIFY(nvlist_alloc(&fuids[i], - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], - FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, retidx) == 0); - for (i = 0; i != retidx; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, retidx * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - - rw_exit(&zfsvfs->z_fuid_lock); - return (retidx); - } -} - -/* - * Query domain table by index, returning domain string - * - * Returns a pointer from an avl node of the domain string. 
- * - */ -static char * -zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) -{ - char *domain; - - if (idx == 0 || !zfsvfs->z_use_fuids) - return (NULL); - - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, NULL); - - rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); - rw_exit(&zfsvfs->z_fuid_lock); - - ASSERT(domain); - return (domain); -} - -void -zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) -{ - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, - cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, - cr, ZFS_GROUP); -} - -uid_t -zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, - cred_t *cr, zfs_fuid_type_t type) -{ - uint32_t index = FUID_INDEX(fuid); - char *domain; - uid_t id; - - if (index == 0) - return (fuid); - - domain = zfs_fuid_find_by_idx(zfsvfs, index); - ASSERT(domain != NULL); - - if (type == ZFS_OWNER || type == ZFS_ACE_USER) { - (void) kidmap_getuidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } else { - (void) kidmap_getgidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } - return (id); -} - -/* - * Add a FUID node to the list of fuid's being created for this - * ACL - * - * If ACL has multiple domains, then keep only one copy of each unique - * domain. - */ -static void -zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, - uint64_t idx, uint64_t id, zfs_fuid_type_t type) -{ - zfs_fuid_t *fuid; - zfs_fuid_domain_t *fuid_domain; - zfs_fuid_info_t *fuidp; - uint64_t fuididx; - boolean_t found = B_FALSE; - - if (*fuidpp == NULL) - *fuidpp = zfs_fuid_info_alloc(); - - fuidp = *fuidpp; - /* - * First find fuid domain index in linked list - * - * If one isn't found then create an entry. 
- */ - - for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); - fuid_domain; fuid_domain = list_next(&fuidp->z_domains, - fuid_domain), fuididx++) { - if (idx == fuid_domain->z_domidx) { - found = B_TRUE; - break; - } - } - - if (!found) { - fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); - fuid_domain->z_domain = domain; - fuid_domain->z_domidx = idx; - list_insert_tail(&fuidp->z_domains, fuid_domain); - fuidp->z_domain_str_sz += strlen(domain) + 1; - fuidp->z_domain_cnt++; - } - - if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { - /* - * Now allocate fuid entry and add it on the end of the list - */ - - fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - fuid->z_id = id; - fuid->z_domidx = idx; - fuid->z_logfuid = FUID_ENCODE(fuididx, rid); - - list_insert_tail(&fuidp->z_fuids, fuid); - fuidp->z_fuid_cnt++; - } else { - if (type == ZFS_OWNER) - fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); - else - fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); - } -} - -/* - * Create a file system FUID, based on information in the users cred - */ -uint64_t -zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) -{ - uint64_t idx; - ksid_t *ksid; - uint32_t rid; - char *kdomain; - const char *domain; - uid_t id; - - VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); - - if (type == ZFS_OWNER) - id = crgetuid(cr); - else - id = crgetgid(cr); - - if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id)) - return ((uint64_t)id); - - ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); - - VERIFY(ksid != NULL); - rid = ksid_getrid(ksid); - domain = ksid_getdomain(ksid); - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); - - zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); - - return (FUID_ENCODE(idx, rid)); -} - -/* - * Create a file system FUID for an ACL ace - * or a chown/chgrp of the file. 
- * This is similar to zfs_fuid_create_cred, except that - * we can't find the domain + rid information in the - * cred. Instead we have to query Winchester for the - * domain and rid. - * - * During replay operations the domain+rid information is - * found in the zfs_fuid_info_t that the replay code has - * attached to the zfsvfs of the file system. - */ -uint64_t -zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) -{ - const char *domain; - char *kdomain; - uint32_t fuid_idx = FUID_INDEX(id); - uint32_t rid; - idmap_stat status; - uint64_t idx; - boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); - zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp; - - /* - * If POSIX ID, or entry is already a FUID then - * just return the id - * - * We may also be handed an already FUID'ized id via - * chmod. - */ - - if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) - return (id); - - if (is_replay) { - fuidp = zfsvfs->z_fuid_replay; - - /* - * If we are passed an ephemeral id, but no - * fuid_info was logged then return NOBODY. - * This is most likely a result of idmap service - * not being available. 
- */ - if (fuidp == NULL) - return (UID_NOBODY); - - switch (type) { - case ZFS_ACE_USER: - case ZFS_ACE_GROUP: - zfuid = list_head(&fuidp->z_fuids); - rid = FUID_RID(zfuid->z_logfuid); - idx = FUID_INDEX(zfuid->z_logfuid); - break; - case ZFS_OWNER: - rid = FUID_RID(fuidp->z_fuid_owner); - idx = FUID_INDEX(fuidp->z_fuid_owner); - break; - case ZFS_GROUP: - rid = FUID_RID(fuidp->z_fuid_group); - idx = FUID_INDEX(fuidp->z_fuid_group); - break; - }; - domain = fuidp->z_domain_table[idx -1]; - } else { - if (type == ZFS_OWNER || type == ZFS_ACE_USER) - status = kidmap_getsidbyuid(crgetzone(cr), id, - &domain, &rid); - else - status = kidmap_getsidbygid(crgetzone(cr), id, - &domain, &rid); - - if (status != 0) { - /* - * When returning nobody we will need to - * make a dummy fuid table entry for logging - * purposes. - */ - rid = UID_NOBODY; - domain = ""; - } - } - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); - - if (!is_replay) - zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); - else if (zfuid != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - return (FUID_ENCODE(idx, rid)); -} - -void -zfs_fuid_destroy(zfsvfs_t *zfsvfs) -{ - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - if (!zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Allocate zfs_fuid_info for tracking FUIDs created during - * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() - */ -zfs_fuid_info_t * -zfs_fuid_info_alloc(void) -{ - zfs_fuid_info_t *fuidp; - - fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); - list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), - offsetof(zfs_fuid_domain_t, z_next)); - list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), - offsetof(zfs_fuid_t, z_next)); - return (fuidp); -} - -/* - * Release all memory associated with zfs_fuid_info_t - */ -void 
-zfs_fuid_info_free(zfs_fuid_info_t *fuidp) -{ - zfs_fuid_t *zfuid; - zfs_fuid_domain_t *zdomain; - - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - - if (fuidp->z_domain_table != NULL) - kmem_free(fuidp->z_domain_table, - (sizeof (char **)) * fuidp->z_domain_cnt); - - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); - kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } - - kmem_free(fuidp, sizeof (zfs_fuid_info_t)); -} - -/* - * Check to see if id is a groupmember. If cred - * has ksid info then sidlist is checked first - * and if still not found then POSIX groups are checked - * - * Will use a straight FUID compare when possible. - */ -boolean_t -zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) -{ - ksid_t *ksid = crgetsid(cr, KSID_GROUP); - uid_t gid; - - if (ksid) { - int i; - ksid_t *ksid_groups; - ksidlist_t *ksidlist = crgetsidlist(cr); - uint32_t idx = FUID_INDEX(id); - uint32_t rid = FUID_RID(id); - - ASSERT(ksidlist); - ksid_groups = ksidlist->ksl_sids; - - for (i = 0; i != ksidlist->ksl_nsid; i++) { - if (idx == 0) { - if (id != IDMAP_WK_CREATOR_GROUP_GID && - id == ksid_groups[i].ks_id) { - return (B_TRUE); - } - } else { - char *domain; - - domain = zfs_fuid_find_by_idx(zfsvfs, idx); - ASSERT(domain != NULL); - - if (strcmp(domain, - IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) - return (B_FALSE); - - if ((strcmp(domain, - ksid_groups[i].ks_domain->kd_name) == 0) && - rid == ksid_groups[i].ks_rid) - return (B_TRUE); - } - } - } - - /* - * Not found in ksidlist, check posix groups - */ - gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); - return (groupmember(gid, cr)); -} -#endif diff --git a/zfs/lib/libdmu-ctl/zfs_ioctl.c b/zfs/lib/libdmu-ctl/zfs_ioctl.c deleted file mode 100644 index e4d253474..000000000 --- a/zfs/lib/libdmu-ctl/zfs_ioctl.c +++ /dev/null @@ -1,3055 +0,0 @@ -/* - * CDDL HEADER START - * - 
* The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_ioctl.c 1.61 08/04/27 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/buf.h> -#include <sys/modctl.h> -#include <sys/open.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_znode.h> -#include <sys/zap.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/vdev.h> -#include <sys/vdev_impl.h> -#include <sys/dmu.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_deleg.h> -#include <sys/dmu_objset.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/sunldi.h> -#include <sys/policy.h> -#include <sys/zone.h> -#include <sys/nvpair.h> -#include <sys/pathname.h> -#include <sys/mount.h> -#include <sys/sdt.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_dir.h> -#include <sys/zvol.h> -#include <sharefs/share.h> -#include <sys/dmu_objset.h> - -#include 
"zfs_namecheck.h" -#include "zfs_prop.h" -#include "zfs_deleg.h" - -extern struct modlfs zfs_modlfs; - -extern void zfs_init(void); -extern void zfs_fini(void); - -ldi_ident_t zfs_li = NULL; -dev_info_t *zfs_dip; - -typedef int zfs_ioc_func_t(zfs_cmd_t *); -typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); - -typedef struct zfs_ioc_vec { - zfs_ioc_func_t *zvec_func; - zfs_secpolicy_func_t *zvec_secpolicy; - enum { - NO_NAME, - POOL_NAME, - DATASET_NAME - } zvec_namecheck; - boolean_t zvec_his_log; -} zfs_ioc_vec_t; - -/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ -void -__dprintf(const char *file, const char *func, int line, const char *fmt, ...) -{ - const char *newfile; - char buf[256]; - va_list adx; - - /* - * Get rid of annoying "../common/" prefix to filename. - */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - /* - * To get this data, use the zfs-dprintf probe as so: - * dtrace -q -n 'zfs-dprintf \ - * /stringof(arg0) == "dbuf.c"/ \ - * {printf("%s: %s", stringof(arg1), stringof(arg3))}' - * arg0 = file name - * arg1 = function name - * arg2 = line number - * arg3 = message - */ - DTRACE_PROBE4(zfs__dprintf, - char *, newfile, char *, func, int, line, char *, buf); -} - -static void -history_str_free(char *buf) -{ - kmem_free(buf, HIS_MAX_RECORD_LEN); -} - -static char * -history_str_get(zfs_cmd_t *zc) -{ - char *buf; - - if (zc->zc_history == NULL) - return (NULL); - - buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - if (copyinstr((void *)(uintptr_t)zc->zc_history, - buf, HIS_MAX_RECORD_LEN, NULL) != 0) { - history_str_free(buf); - return (NULL); - } - - buf[HIS_MAX_RECORD_LEN -1] = '\0'; - - return (buf); -} - -/* - * zfs_check_version - * - * Return non-zero if the spa version is less than requested version. 
- */ -static int -zfs_check_version(const char *name, int version) -{ - - spa_t *spa; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa_version(spa) < version) { - spa_close(spa, FTAG); - return (1); - } - spa_close(spa, FTAG); - } - return (0); -} - -/* - * zpl_check_version - * - * Return non-zero if the ZPL version is less than requested version. - */ -static int -zpl_check_version(const char *name, int version) -{ - objset_t *os; - int rc = 1; - - if (dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { - uint64_t propversion; - - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, - &propversion) == 0) { - rc = !(propversion >= version); - } - dmu_objset_close(os); - } - return (rc); -} - -static void -zfs_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *buf; - - if ((buf = history_str_get(zc)) == NULL) - return; - - if (spa_open(zc->zc_name, &spa, FTAG) == 0) { - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) - (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); - spa_close(spa, FTAG); - } - history_str_free(buf); -} - -/* - * Policy for top-level read operations (list pools). Requires no privileges, - * and can be used in the local zone, as there is no associated dataset. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) -{ - return (0); -} - -/* - * Policy for dataset read operations (list children, get statistics). Requires - * no privileges, but must be visible in the local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) -{ - if (INGLOBALZONE(curproc) || - zone_dataset_visible(zc->zc_name, NULL)) - return (0); - - return (ENOENT); -} - -static int -zfs_dozonecheck(const char *dataset, cred_t *cr) -{ - uint64_t zoned; - int writable = 1; - - /* - * The dataset must be visible by this zone -- check this first - * so they don't see EPERM on something they shouldn't know about. 
- */ - if (!INGLOBALZONE(curproc) && - !zone_dataset_visible(dataset, &writable)) - return (ENOENT); - - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) - return (ENOENT); - - if (INGLOBALZONE(curproc)) { - /* - * If the fs is zoned, only root can access it from the - * global zone. - */ - if (secpolicy_zfs(cr) && zoned) - return (EPERM); - } else { - /* - * If we are in a local zone, the 'zoned' property must be set. - */ - if (!zoned) - return (EPERM); - - /* must be writable by this zone */ - if (!writable) - return (EPERM); - } - return (0); -} - -int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck(name, cr); - if (error == 0) { - error = secpolicy_zfs(cr); - if (error) - error = dsl_deleg_access(name, perm, cr); - } - return (error); -} - -static int -zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) -{ - /* - * Check permissions for special properties. - */ - switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. - */ - if (!INGLOBALZONE(curproc)) - return (EPERM); - break; - - case ZFS_PROP_QUOTA: - if (!INGLOBALZONE(curproc)) { - uint64_t zoned; - char setpoint[MAXNAMELEN]; - /* - * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) - * the thing they own. 
- */ - if (dsl_prop_get_integer(name, "zoned", &zoned, - setpoint)) - return (EPERM); - if (!zoned || strlen(name) <= strlen(setpoint)) - return (EPERM); - } - break; - } - - return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); -} - -int -zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck(zc->zc_name, cr); - if (error) - return (error); - - /* - * permission to set permissions will be evaluated later in - * dsl_deleg_can_allow() - */ - return (0); -} - -int -zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) -{ - int error; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr); - if (error == 0) - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr); - return (error); -} - -int -zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); -} - -int -zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) -{ - if (!INGLOBALZONE(curproc)) - return (EPERM); - - if (secpolicy_nfs(cr) == 0) { - return (0); - } else { - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ - - if (vp->v_vfsp->vfs_fstype != zfsfstype || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (EPERM); - } - - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); - } -} - -static int -zfs_get_parent(const char *datasetname, char *parent, int parentsize) -{ - char *cp; - - /* - * Remove the @bla or /bla from the end of the name to get the parent. 
- */ - (void) strncpy(parent, datasetname, parentsize); - cp = strrchr(parent, '@'); - if (cp != NULL) { - cp[0] = '\0'; - } else { - cp = strrchr(parent, '/'); - if (cp == NULL) - return (ENOENT); - cp[0] = '\0'; - } - - return (0); -} - -int -zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); -} - -static int -zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) -{ - return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); -} - -/* - * Must have sys_config privilege to check the iscsi permission - */ -/* ARGSUSED */ -static int -zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) -{ - return (secpolicy_zfs(cr)); -} - -int -zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) -{ - char parentname[MAXNAMELEN]; - int error; - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_RENAME, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - if ((error = zfs_get_parent(to, parentname, - sizeof (parentname))) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (error); -} - -static int -zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) -{ - return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); -} - -static int -zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) -{ - char parentname[MAXNAMELEN]; - objset_t *clone; - int error; - - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_PROMOTE, cr); - if (error) - return (error); - - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &clone); - - if (error == 0) { - 
dsl_dataset_t *pclone = NULL; - dsl_dir_t *dd; - dd = clone->os->os_dsl_dataset->ds_dir; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - error = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, NULL, - DS_MODE_NONE, FTAG, &pclone); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (error) { - dmu_objset_close(clone); - return (error); - } - - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr); - - dsl_dataset_name(pclone, parentname); - dmu_objset_close(clone); - dsl_dataset_close(pclone, DS_MODE_NONE, FTAG); - if (error == 0) - error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_PROMOTE, cr); - } - return (error); -} - -static int -zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RECEIVE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_CREATE, cr)); -} - -int -zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) - return (error); - - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr); - - return (error); -} - -static int -zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) -{ - - return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); -} - -static int -zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) -{ - char parentname[MAXNAMELEN]; - int error; - - if ((error = zfs_get_parent(zc->zc_name, parentname, - sizeof (parentname))) != 0) - return (error); - - if (zc->zc_value[0] != '\0') { - if ((error = zfs_secpolicy_write_perms(zc->zc_value, - ZFS_DELEG_PERM_CLONE, cr)) != 0) - return (error); - } - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - error = 
zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr); - - return (error); -} - -static int -zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) -{ - int error; - - error = secpolicy_fs_unmount(cr, NULL); - if (error) { - error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); - } - return (error); -} - -/* - * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires - * SYS_CONFIG privilege, which is not available in a local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) - return (EPERM); - - return (0); -} - -/* - * Just like zfs_secpolicy_config, except that we will check for - * mount permission on the dataset for permission to create/remove - * the minor nodes. - */ -static int -zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) { - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)); - } - - return (0); -} - -/* - * Policy for fault injection. Requires all privileges. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) -{ - return (secpolicy_zinject(cr)); -} - -static int -zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) -{ - zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); - - if (prop == ZPROP_INVAL) { - if (!zfs_prop_user(zc->zc_value)) - return (EINVAL); - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_USERPROP, cr)); - } else { - if (!zfs_prop_inheritable(prop)) - return (EINVAL); - return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); - } -} - -/* - * Returns the nvlist as specified by the user in the zfs_cmd_t. - */ -static int -get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in and unpack the user-supplied nvlist. 
- */ - if (size == 0) - return (EINVAL); - - packed = kmem_alloc(size, KM_SLEEP); - - if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { - kmem_free(packed, size); - return (error); - } - - if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { - kmem_free(packed, size); - return (error); - } - - kmem_free(packed, size); - - *nvp = list; - return (0); -} - -static int -put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - size_t size; - int error; - - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); - - if (size > zc->zc_nvlist_dst_size) { - error = ENOMEM; - } else { - packed = kmem_alloc(size, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); - kmem_free(packed, size); - } - - zc->zc_nvlist_dst_size = size; - return (error); -} - -static int -zfs_ioc_pool_create(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config, *props = NULL; - char *buf; - - if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { - nvlist_free(config); - return (error); - } - - buf = history_str_get(zc); - - error = spa_create(zc->zc_name, config, props, buf); - - if (buf != NULL) - history_str_free(buf); - - nvlist_free(config); - - if (props) - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_destroy(zfs_cmd_t *zc) -{ - int error; - zfs_log_history(zc); - error = spa_destroy(zc->zc_name); - return (error); -} - -static int -zfs_ioc_pool_import(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config, *props = NULL; - uint64_t guid; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) != 0) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { - 
nvlist_free(config); - return (error); - } - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_guid) - error = EINVAL; - else - error = spa_import(zc->zc_name, config, props); - - nvlist_free(config); - - if (props) - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_export(zfs_cmd_t *zc) -{ - int error; - zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL); - return (error); -} - -static int -zfs_ioc_pool_configs(zfs_cmd_t *zc) -{ - nvlist_t *configs; - int error; - - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (EEXIST); - - error = put_nvlist(zc, configs); - - nvlist_free(configs); - - return (error); -} - -static int -zfs_ioc_pool_stats(zfs_cmd_t *zc) -{ - nvlist_t *config; - int error; - int ret = 0; - - error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); - - if (config != NULL) { - ret = put_nvlist(zc, config); - nvlist_free(config); - - /* - * The config may be present even if 'error' is non-zero. - * In this case we return success, and preserve the real errno - * in 'zc_cookie'. - */ - zc->zc_cookie = error; - } else { - ret = error; - } - - return (ret); -} - -/* - * Try to import the given pool, returning pool stats as appropriate so that - * user land knows which devices are available and overall pool health. 
- */ -static int -zfs_ioc_pool_tryimport(zfs_cmd_t *zc) -{ - nvlist_t *tryconfig, *config; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &tryconfig)) != 0) - return (error); - - config = spa_tryimport(tryconfig); - - nvlist_free(tryconfig); - - if (config == NULL) - return (EINVAL); - - error = put_nvlist(zc, config); - nvlist_free(config); - - return (error); -} - -static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - mutex_enter(&spa_namespace_lock); - error = spa_scrub(spa, zc->zc_cookie, B_FALSE); - mutex_exit(&spa_namespace_lock); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_freeze(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - spa_freeze(spa); - spa_close(spa, FTAG); - } - return (error); -} - -static int -zfs_ioc_pool_upgrade(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { - spa_close(spa, FTAG); - return (EINVAL); - } - - spa_upgrade(spa, zc->zc_cookie); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *hist_buf; - uint64_t size; - int error; - - if ((size = zc->zc_history_len) == 0) - return (EINVAL); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - hist_buf = kmem_alloc(size, KM_SLEEP); - if ((error = spa_history_get(spa, &zc->zc_history_offset, - &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, - (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); - } - - spa_close(spa, FTAG); - kmem_free(hist_buf, size); - return (error); -} - -static 
int -zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) -{ - int error; - - if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) - return (error); - - return (0); -} - -static int -zfs_ioc_obj_to_path(zfs_cmd_t *zc) -{ - objset_t *osp; - int error; - - if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0) - return (error); - - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, - sizeof (zc->zc_value)); - dmu_objset_close(osp); - - return (error); -} - -static int -zfs_ioc_vdev_add(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *config, **l2cache, **spares; - uint_t nl2cache = 0, nspares = 0; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config); - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache); - - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, - &spares, &nspares); - - /* - * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool. - * - * Intent log device can not be added to a rootpool because - * during mountroot, zil is replayed, a seperated log device - * can not be accessed during the mountroot time. - * - * l2cache and spare devices are ok to be added to a rootpool. 
- */ - if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { - spa_close(spa, FTAG); - return (EDOM); - } - - if (error == 0) { - error = spa_vdev_add(spa, config); - nvlist_free(config); - } - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_remove(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_set_state(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - vdev_state_t newstate = VDEV_STATE_UNKNOWN; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - switch (zc->zc_cookie) { - case VDEV_STATE_ONLINE: - error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); - break; - - case VDEV_STATE_OFFLINE: - error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); - break; - - case VDEV_STATE_FAULTED: - error = vdev_fault(spa, zc->zc_guid); - break; - - case VDEV_STATE_DEGRADED: - error = vdev_degrade(spa, zc->zc_guid); - break; - - default: - error = EINVAL; - } - zc->zc_cookie = newstate; - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_attach(zfs_cmd_t *zc) -{ - spa_t *spa; - int replacing = zc->zc_cookie; - nvlist_t *config; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); - nvlist_free(config); - } - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_detach(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) -{ - spa_t *spa; - char *path = zc->zc_value; - 
uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_os_open_retry(char *name, objset_t **os) -{ - int error; - -retry: - error = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - } - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - * zc_value alternate root - */ -static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) -{ - objset_t *os = NULL; - int error; - nvlist_t *nv; - - if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) - return (error); - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { - dmu_objset_stats(os, nv); - /* - * NB: zvol_get_stats() will read the objset contents, - * which we aren't supposed to do with a - * DS_MODE_STANDARD open, because it could be - * inconsistent. So this is a bit of a workaround... 
- */ - if (!zc->zc_objset_stats.dds_inconsistent) { - if (dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); - } - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); - - dmu_objset_close(os); - return (error); -} - -static int -nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) -{ - uint64_t value; - int error; - - /* - * zfs_get_zplprop() will either find a value or give us - * the default value (if there is one). - */ - if ((error = zfs_get_zplprop(os, prop, &value)) != 0) - return (error); - VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for zpl property nvlist - * - * outputs: - * zc_nvlist_dst zpl property nvlist - * zc_nvlist_dst_size size of zpl property nvlist - */ -static int -zfs_ioc_objset_zplprops(zfs_cmd_t *zc) -{ - objset_t *os; - int err; - - if ((err = zfs_os_open_retry(zc->zc_name, &os)) != 0) - return (err); - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - /* - * NB: nvl_add_zplprop() will read the objset contents, - * which we aren't supposed to do with a DS_MODE_STANDARD - * open, because it could be inconsistent. 
- */ - if (zc->zc_nvlist_dst != NULL && - !zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZFS) { - nvlist_t *nv; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) - err = put_nvlist(zc, nv); - nvlist_free(nv); - } else { - err = ENOENT; - } - dmu_objset_close(os); - return (err); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_name name of next filesystem - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - * zc_value alternate root - */ -static int -zfs_ioc_dataset_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - char *p; - - if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) { - if (error == ENOENT) - error = ESRCH; - return (error); - } - - p = strrchr(zc->zc_name, '/'); - if (p == NULL || p[1] != '\0') - (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); - p = zc->zc_name + strlen(zc->zc_name); - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &zc->zc_cookie); - if (error == ENOENT) - error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL)); - - /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. 
- */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - - dmu_objset_close(os); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_name name of next snapshot - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - * zc_value alternate root - */ -static int -zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) { - if (error == ENOENT) - error = ESRCH; - return (error); - } - - /* - * A dataset name of maximum length cannot have any snapshots, - * so exit immediately. - */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { - dmu_objset_close(os); - return (ESRCH); - } - - error = dmu_snapshot_list_next(os, - sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - if (error == ENOENT) - error = ESRCH; - - if (error == 0) - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - - /* if we failed, undo the @ that we tacked on to zc_name */ - if (error != 0) - *strchr(zc->zc_name, '@') = '\0'; - - dmu_objset_close(os); - return (error); -} - -int -zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) -{ - nvpair_t *elem; - int error; - uint64_t intval; - char *strval; - - /* - * First validate permission to set all of the properties - */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); - - if (prop == ZPROP_INVAL) { - /* - * If this is a user-defined property, it must be a - * string, and there is no further validation to do. 
- */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); - - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; - } - - if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) - return (error); - - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. - */ - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0 && - intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9) { - if (zfs_check_version(name, - SPA_VERSION_GZIP_COMPRESSION)) - return (ENOTSUP); - } - break; - - case ZFS_PROP_COPIES: - if (zfs_check_version(name, SPA_VERSION_DITTO_BLOCKS)) - return (ENOTSUP); - break; - - case ZFS_PROP_SHARESMB: - if (zpl_check_version(name, ZPL_VERSION_FUID)) - return (ENOTSUP); - break; - } - if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) - return (error); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); - - if (prop == ZPROP_INVAL) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); - } - - switch (prop) { - case ZFS_PROP_QUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, intval)) != 0) - return (error); - break; - - case ZFS_PROP_REFQUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_quota(name, intval)) != 0) - return (error); - break; - - case ZFS_PROP_RESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_reservation(name, - 
intval)) != 0) - return (error); - break; - - case ZFS_PROP_REFRESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_reservation(name, - intval)) != 0) - return (error); - break; - - case ZFS_PROP_VOLSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, - ddi_driver_major(zfs_dip), intval)) != 0) - return (error); - break; - - case ZFS_PROP_VOLBLOCKSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, intval)) != 0) - return (error); - break; - - case ZFS_PROP_VERSION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zfs_set_version(name, intval)) != 0) - return (error); - break; - - default: - if (nvpair_type(elem) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { - const char *unused; - - VERIFY(nvpair_value_uint64(elem, &intval) == 0); - - switch (zfs_prop_get_type(prop)) { - case PROP_TYPE_NUMBER: - break; - case PROP_TYPE_STRING: - return (EINVAL); - case PROP_TYPE_INDEX: - if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - return (EINVAL); - break; - default: - cmn_err(CE_PANIC, - "unknown property type"); - break; - } - - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); - } else { - return (EINVAL); - } - break; - } - } - - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to inherit - * zc_nvlist_src{_size} nvlist of properties to apply - * - * outputs: none - */ -static int -zfs_ioc_set_prop(zfs_cmd_t *zc) -{ - nvlist_t *nvl; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvl)) != 0) - return (error); - - error = 
zfs_set_prop_nvlist(zc->zc_name, nvl); - - nvlist_free(nvl); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to inherit - * - * outputs: none - */ -static int -zfs_ioc_inherit_prop(zfs_cmd_t *zc) -{ - /* the property name has been validated by zfs_secpolicy_inherit() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); -} - -static int -zfs_ioc_pool_set_props(zfs_cmd_t *zc) -{ - nvlist_t *props; - spa_t *spa; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props))) - return (error); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(props); - return (error); - } - - error = spa_prop_set(spa, props); - - nvlist_free(props); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *nvp = NULL; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_prop_get(spa, &nvp); - - if (error == 0 && zc->zc_nvlist_dst != NULL) - error = put_nvlist(zc, nvp); - else - error = EFAULT; - - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); - return (error); -} - -static int -zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) -{ - nvlist_t *nvp; - int error; - uint32_t uid; - uint32_t gid; - uint32_t *groups; - uint_t group_cnt; - cred_t *usercred; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvp)) != 0) { - return (error); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_UID, &uid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_GID, &gid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - - if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, - &groups, &group_cnt)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - usercred = cralloc(); - if ((crsetugid(usercred, uid, gid) != 0) || - (crsetgroups(usercred, group_cnt, 
(gid_t *)groups) != 0)) { - nvlist_free(nvp); - crfree(usercred); - return (EPERM); - } - nvlist_free(nvp); - error = dsl_deleg_access(zc->zc_name, - zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); - crfree(usercred); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_src{_size} nvlist of delegated permissions - * zc_perm_action allow/unallow flag - * - * outputs: none - */ -static int -zfs_ioc_set_fsacl(zfs_cmd_t *zc) -{ - int error; - nvlist_t *fsaclnv = NULL; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &fsaclnv)) != 0) - return (error); - - /* - * Verify nvlist is constructed correctly - */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { - nvlist_free(fsaclnv); - return (EINVAL); - } - - /* - * If we don't have PRIV_SYS_MOUNT, then validate - * that user is allowed to hand out each permission in - * the nvlist(s) - */ - - error = secpolicy_zfs(CRED()); - if (error) { - if (zc->zc_perm_action == B_FALSE) { - error = dsl_deleg_can_allow(zc->zc_name, - fsaclnv, CRED()); - } else { - error = dsl_deleg_can_unallow(zc->zc_name, - fsaclnv, CRED()); - } - } - - if (error == 0) - error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); - - nvlist_free(fsaclnv); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * zc_nvlist_src{_size} nvlist of delegated permissions - */ -static int -zfs_ioc_get_fsacl(zfs_cmd_t *zc) -{ - nvlist_t *nvp; - int error; - - if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); - } - - return (error); -} - -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_create_minor(zfs_cmd_t *zc) -{ - return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); -} - -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_remove_minor(zfs_cmd_t *zc) -{ - return (zvol_remove_minor(zc->zc_name)); -} - -/* - * 
Search the vfs list for a specified resource. Returns a pointer to it - * or NULL if no suitable entry is found. The caller of this routine - * is responsible for releasing the returned vfs pointer. - */ -static vfs_t * -zfs_get_vfs(const char *resource) -{ - struct vfs *vfsp; - struct vfs *vfs_found = NULL; - - vfs_list_read_lock(); - vfsp = rootvfs; - do { - if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { - VFS_HOLD(vfsp); - vfs_found = vfsp; - break; - } - vfsp = vfsp->vfs_next; - } while (vfsp != rootvfs); - vfs_list_unlock(); - return (vfs_found); -} - -/* ARGSUSED */ -static void -zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - - zfs_create_fs(os, cr, zct->zct_zplprops, tx); -} - -#define ZFS_PROP_UNDEFINED ((uint64_t)-1) - -/* - * inputs: - * createprops list of properties requested by creator - * dataset name of dataset we are creating - * - * outputs: - * zplprops values for the zplprops we attach to the master node object - * - * Determine the settings for utf8only, normalization and - * casesensitivity. Specific values may have been requested by the - * creator and/or we can inherit values from the parent dataset. If - * the file system is of too early a vintage, a creator can not - * request settings for these properties, even if the requested - * setting is the default value. We don't actually want to create dsl - * properties for these, so remove them from the source nvlist after - * processing. 
- */ -static int -zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, - nvlist_t *zplprops, uint64_t zplver, boolean_t *is_ci) -{ - objset_t *os; - char parentname[MAXNAMELEN]; - char *cp; - uint64_t sense = ZFS_PROP_UNDEFINED; - uint64_t norm = ZFS_PROP_UNDEFINED; - uint64_t u8 = ZFS_PROP_UNDEFINED; - int error = 0; - - ASSERT(zplprops != NULL); - - (void) strlcpy(parentname, dataset, sizeof (parentname)); - cp = strrchr(parentname, '/'); - ASSERT(cp != NULL); - cp[0] = '\0'; - - /* - * Pull out creator prop choices, if any. - */ - if (createprops) { - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_CASE), &sense); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_CASE)); - } - - /* - * If the file system or pool is version is too "young" to - * support normalization and the creator tried to set a value - * for one of the props, error out. We only need check the - * ZPL version because we've already checked by now that the - * SPA version is compatible with the selected ZPL version. - */ - if (zplver < ZPL_VERSION_NORMALIZATION && - (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || - sense != ZFS_PROP_UNDEFINED)) - return (ENOTSUP); - - /* - * Put the version in the zplprops - */ - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - - /* - * Open parent object set so we can inherit zplprop values if - * necessary. 
- */ - if ((error = zfs_os_open_retry(parentname, &os)) != 0) - return (error); - - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); - - /* - * If we're normalizing, names must always be valid UTF-8 strings. - */ - if (norm) - u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); - - if (is_ci) - *is_ci = (sense == ZFS_CASE_INSENSITIVE); - - dmu_objset_close(os); - return (0); -} - -/* - * inputs: - * zc_objset_type type of objset to create (fs vs zvol) - * zc_name name of new objset - * zc_value name of snapshot to clone from (may be empty) - * zc_nvlist_src{_size} nvlist of properties to apply - * - * outputs: none - */ -static int -zfs_ioc_create(zfs_cmd_t *zc) -{ - objset_t *clone; - int error = 0; - zfs_creat_t zct; - nvlist_t *nvprops = NULL; - void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - dmu_objset_type_t type = zc->zc_objset_type; - - switch (type) { - - case DMU_OST_ZFS: - cbfunc = zfs_create_cb; - break; - - case DMU_OST_ZVOL: - cbfunc = zvol_create_cb; - break; - - default: - cbfunc = NULL; - break; - } - if (strchr(zc->zc_name, '@') || - strchr(zc->zc_name, '%')) - return (EINVAL); - - if (zc->zc_nvlist_src != NULL && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) - return (error); - - zct.zct_zplprops = NULL; - zct.zct_props = nvprops; - - if (zc->zc_value[0] != '\0') { - /* - * We're creating a clone of an existing snapshot. 
- */ - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { - nvlist_free(nvprops); - return (EINVAL); - } - - error = dmu_objset_open(zc->zc_value, type, - DS_MODE_STANDARD | DS_MODE_READONLY, &clone); - if (error) { - nvlist_free(nvprops); - return (error); - } - - error = dmu_objset_create(zc->zc_name, type, clone, 0, - NULL, NULL); - if (error) { - dmu_objset_close(clone); - nvlist_free(nvprops); - return (error); - } - dmu_objset_close(clone); - } else { - boolean_t is_insensitive = B_FALSE; - - if (cbfunc == NULL) { - nvlist_free(nvprops); - return (EINVAL); - } - - if (type == DMU_OST_ZVOL) { - uint64_t volsize, volblocksize; - - if (nvprops == NULL || - nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &volsize) != 0) { - nvlist_free(nvprops); - return (EINVAL); - } - - if ((error = nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize)) != 0 && error != ENOENT) { - nvlist_free(nvprops); - return (EINVAL); - } - - if (error != 0) - volblocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); - - if ((error = zvol_check_volblocksize( - volblocksize)) != 0 || - (error = zvol_check_volsize(volsize, - volblocksize)) != 0) { - nvlist_free(nvprops); - return (error); - } - } else if (type == DMU_OST_ZFS) { - uint64_t version; - int error; - - /* - * Default ZPL version to non-FUID capable if the - * pool is not upgraded to support FUIDs. - */ - if (zfs_check_version(zc->zc_name, SPA_VERSION_FUID)) - version = ZPL_VERSION_FUID - 1; - else - version = ZPL_VERSION; - - /* - * Potentially override default ZPL version based - * on creator's request. 
- */ - (void) nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VERSION), &version); - - /* - * Make sure version we ended up with is kosher - */ - if ((version < ZPL_VERSION_INITIAL || - version > ZPL_VERSION) || - (version >= ZPL_VERSION_FUID && - zfs_check_version(zc->zc_name, SPA_VERSION_FUID))) { - nvlist_free(nvprops); - return (ENOTSUP); - } - - /* - * We have to have normalization and - * case-folding flags correct when we do the - * file system creation, so go figure them out - * now. - */ - VERIFY(nvlist_alloc(&zct.zct_zplprops, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops(zc->zc_name, nvprops, - zct.zct_zplprops, version, &is_insensitive); - if (error != 0) { - nvlist_free(nvprops); - nvlist_free(zct.zct_zplprops); - return (error); - } - } - error = dmu_objset_create(zc->zc_name, type, NULL, - is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); - nvlist_free(zct.zct_zplprops); - } - - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); - } - nvlist_free(nvprops); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot - * zc_cookie recursive flag - * - * outputs: none - */ -static int -zfs_ioc_snapshot(zfs_cmd_t *zc) -{ - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - return (dmu_objset_snapshot(zc->zc_name, - zc->zc_value, zc->zc_cookie)); -} - -int -zfs_unmount_snap(char *name, void *arg) -{ - char *snapname = arg; - char *cp; - vfs_t *vfsp = NULL; - - /* - * Snapshots (which are under .zfs control) must be unmounted - * before they can be destroyed. 
- */ - - if (snapname) { - (void) strcat(name, "@"); - (void) strcat(name, snapname); - vfsp = zfs_get_vfs(name); - cp = strchr(name, '@'); - *cp = '\0'; - } else if (strchr(name, '@')) { - vfsp = zfs_get_vfs(name); - } - - if (vfsp) { - /* - * Always force the unmount for snapshots. - */ - int flag = MS_FORCE; - int err; - - if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { - VFS_RELE(vfsp); - return (err); - } - VFS_RELE(vfsp); - if ((err = dounmount(vfsp, flag, kcred)) != 0) - return (err); - } - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot - * - * outputs: none - */ -static int -zfs_ioc_destroy_snaps(zfs_cmd_t *zc) -{ - int err; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - err = dmu_objset_find(zc->zc_name, - zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); - if (err) - return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); -} - -/* - * inputs: - * zc_name name of dataset to destroy - * zc_objset_type type of objset - * - * outputs: none - */ -static int -zfs_ioc_destroy(zfs_cmd_t *zc) -{ - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } - - return (dmu_objset_destroy(zc->zc_name)); -} - -/* - * inputs: - * zc_name name of dataset to rollback (to most recent snapshot) - * - * outputs: none - */ -static int -zfs_ioc_rollback(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - zfsvfs_t *zfsvfs = NULL; - - /* - * Get the zfsvfs for the receiving objset. There - * won't be one if we're operating on a zvol, if the - * objset doesn't exist yet, or is not mounted. 
- */ - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD, &os); - if (error) - return (error); - - if (dmu_objset_type(os) == DMU_OST_ZFS) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); - } - - if (zfsvfs != NULL) { - char osname[MAXNAMELEN]; - int mode; - - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; - - ASSERT(strcmp(osname, zc->zc_name) == 0); - error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? error : resume_err; - } else { - dmu_objset_close(os); - } - VFS_RELE(zfsvfs->z_vfs); - } else { - error = dmu_objset_rollback(os); - } - /* Note, the dmu_objset_rollback() closes the objset for us. */ - - return (error); -} - -/* - * inputs: - * zc_name old name of dataset - * zc_value new name of dataset - * zc_cookie recursive flag (only valid for snapshots) - * - * outputs: none - */ -static int -zfs_ioc_rename(zfs_cmd_t *zc) -{ - boolean_t recursive = zc->zc_cookie & 1; - - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '%')) - return (EINVAL); - - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. 
- */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } - - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); -} - -/* - * inputs: - * zc_name name of containing filesystem - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_value name of snapshot to create - * zc_string name of clone origin (if DRR_FLAG_CLONE) - * zc_cookie file descriptor to recv from - * zc_begin_record the BEGIN record of the stream (not byteswapped) - * zc_guid force flag - * - * outputs: - * zc_cookie number of bytes read - */ -static int -zfs_ioc_recv(zfs_cmd_t *zc) -{ - file_t *fp; - objset_t *os; - dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; - boolean_t force = (boolean_t)zc->zc_guid; - int error, fd; - offset_t off; - nvlist_t *props = NULL; - objset_t *origin = NULL; - char *tosnap; - char tofs[ZFS_MAXNAMELEN]; - - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) - return (EINVAL); - - (void) strcpy(tofs, zc->zc_value); - tosnap = strchr(tofs, '@'); - *tosnap = '\0'; - tosnap++; - - if (zc->zc_nvlist_src != NULL && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props)) != 0) - return (error); - - fd = zc->zc_cookie; - fp = getf(fd); - if (fp == NULL) { - nvlist_free(props); - return (EBADF); - } - - /* - * Get the zfsvfs for the receiving objset. There - * won't be one if we're operating on a zvol, if the - * objset doesn't exist yet, or is not mounted. 
- */ - - error = dmu_objset_open(tofs, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (!error) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) { - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - VFS_RELE(zfsvfs->z_vfs); - dmu_objset_close(os); - nvlist_free(props); - releasef(fd); - return (EBUSY); - } - } else { - mutex_exit(&os->os->os_user_ptr_lock); - } - dmu_objset_close(os); - } - - if (zc->zc_string[0]) { - error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &origin); - if (error) { - if (zfsvfs != NULL) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } - nvlist_free(props); - releasef(fd); - return (error); - } - } - - error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); - if (origin) - dmu_objset_close(origin); - if (error) { - if (zfsvfs != NULL) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } - nvlist_free(props); - releasef(fd); - return (error); - } - - /* - * If properties are supplied, they are to completely replace - * the existing ones; "inherit" any existing properties. 
- */ - if (props) { - objset_t *os; - nvlist_t *nv = NULL; - - error = dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - &os); - if (error == 0) { - error = dsl_prop_get_all(os, &nv); - dmu_objset_close(os); - } - if (error == 0) { - nvpair_t *elem; - zfs_cmd_t *zc2; - zc2 = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); - - (void) strcpy(zc2->zc_name, tofs); - for (elem = nvlist_next_nvpair(nv, NULL); elem; - elem = nvlist_next_nvpair(nv, elem)) { - (void) strcpy(zc2->zc_value, nvpair_name(elem)); - if (zfs_secpolicy_inherit(zc2, CRED()) == 0) - (void) zfs_ioc_inherit_prop(zc2); - } - kmem_free(zc2, sizeof (zfs_cmd_t)); - } - if (nv) - nvlist_free(nv); - } - - /* - * Set properties. Note, we ignore errors. Would be better to - * do best-effort in zfs_set_prop_nvlist, too. - */ - (void) zfs_set_prop_nvlist(tofs, props); - nvlist_free(props); - - off = fp->f_offset; - error = dmu_recv_stream(&drc, fp->f_vnode, &off); - - if (error == 0) { - if (zfsvfs != NULL) { - char osname[MAXNAMELEN]; - int mode; - - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; - - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, - osname, mode); - error = error ? 
error : resume_err; - } else { - dmu_recv_abort_cleanup(&drc); - } - } else { - error = dmu_recv_end(&drc); - } - } - if (zfsvfs != NULL) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } - - zc->zc_cookie = off - fp->f_offset; - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - - releasef(fd); - return (error); -} - -/* - * inputs: - * zc_name name of snapshot to send - * zc_value short name of incremental fromsnap (may be empty) - * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_value) - * - * outputs: none - */ -static int -zfs_ioc_send(zfs_cmd_t *zc) -{ - objset_t *fromsnap = NULL; - objset_t *tosnap; - file_t *fp; - int error; - offset_t off; - - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); - if (error) - return (error); - - if (zc->zc_value[0] != '\0') { - char buf[MAXPATHLEN]; - char *cp; - - (void) strncpy(buf, zc->zc_name, sizeof (buf)); - cp = strchr(buf, '@'); - if (cp) - *(cp+1) = 0; - (void) strncat(buf, zc->zc_value, sizeof (buf)); - error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); - if (error) { - dmu_objset_close(tosnap); - return (error); - } - } - - fp = getf(zc->zc_cookie); - if (fp == NULL) { - dmu_objset_close(tosnap); - if (fromsnap) - dmu_objset_close(fromsnap); - return (EBADF); - } - - off = fp->f_offset; - error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off); - - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - releasef(zc->zc_cookie); - if (fromsnap) - dmu_objset_close(fromsnap); - dmu_objset_close(tosnap); - return (error); -} - -static int -zfs_ioc_inject_fault(zfs_cmd_t *zc) -{ - int id, error; - - error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, - &zc->zc_inject_record); - - if (error == 0) - zc->zc_guid = (uint64_t)id; - - return (error); -} - -static int 
-zfs_ioc_clear_fault(zfs_cmd_t *zc) -{ - return (zio_clear_fault((int)zc->zc_guid)); -} - -static int -zfs_ioc_inject_list_next(zfs_cmd_t *zc) -{ - int id = (int)zc->zc_guid; - int error; - - error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), - &zc->zc_inject_record); - - zc->zc_guid = id; - - return (error); -} - -static int -zfs_ioc_error_log(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_clear(zfs_cmd_t *zc) -{ - spa_t *spa; - vdev_t *vd; - uint64_t txg; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - /* - * Try to resume any I/Os which may have been suspended - * as a result of a complete pool failure. - */ - if (!list_is_empty(&spa->spa_zio_list)) { - if (zio_vdev_resume_io(spa) != 0) { - spa_close(spa, FTAG); - return (EIO); - } - } - - txg = spa_vdev_enter(spa); - - if (zc->zc_guid == 0) { - vd = NULL; - } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_aux_vdev_t *sav; - int i; - - /* - * Check if this is an l2cache device. 
- */ - ASSERT(spa != NULL); - sav = &spa->spa_l2cache; - for (i = 0; i < sav->sav_count; i++) { - if (sav->sav_vdevs[i]->vdev_guid == zc->zc_guid) { - vd = sav->sav_vdevs[i]; - break; - } - } - - if (vd == NULL) { - (void) spa_vdev_exit(spa, NULL, txg, ENODEV); - spa_close(spa, FTAG); - return (ENODEV); - } - } - - vdev_clear(spa, vd, B_TRUE); - - (void) spa_vdev_exit(spa, NULL, txg, 0); - - spa_close(spa, FTAG); - - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of origin snapshot - * - * outputs: none - */ -static int -zfs_ioc_promote(zfs_cmd_t *zc) -{ - char *cp; - - /* - * We don't need to unmount *all* the origin fs's snapshots, but - * it's easier. - */ - cp = strchr(zc->zc_value, '@'); - if (cp) - *cp = '\0'; - (void) dmu_objset_find(zc->zc_value, - zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name)); -} - -/* - * We don't want to have a hard dependency - * against some special symbols in sharefs - * nfs, and smbsrv. Determine them if needed when - * the first file system is shared. - * Neither sharefs, nfs or smbsrv are unloadable modules. - */ -int (*znfsexport_fs)(void *arg); -int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); -int (*zsmbexport_fs)(void *arg, boolean_t add_share); - -int zfs_nfsshare_inited; -int zfs_smbshare_inited; - -ddi_modhandle_t nfs_mod; -ddi_modhandle_t sharefs_mod; -ddi_modhandle_t smbsrv_mod; -kmutex_t zfs_share_lock; - -static int -zfs_init_sharefs() -{ - int error; - - ASSERT(MUTEX_HELD(&zfs_share_lock)); - /* Both NFS and SMB shares also require sharetab support. 
*/ - if (sharefs_mod == NULL && ((sharefs_mod = - ddi_modopen("fs/sharefs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - return (ENOSYS); - } - if (zshare_fs == NULL && ((zshare_fs = - (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) - ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { - return (ENOSYS); - } - return (0); -} - -static int -zfs_ioc_share(zfs_cmd_t *zc) -{ - int error; - int opcode; - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (zfs_nfsshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - if (znfsexport_fs == NULL && - ((znfsexport_fs = (int (*)(void *)) - ddi_modsym(nfs_mod, - "nfs_export", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - error = zfs_init_sharefs(); - if (error) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - zfs_nfsshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if (zfs_smbshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (smbsrv_mod == NULL && ((smbsrv_mod = - ddi_modopen("drv/smbsrv", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - if (zsmbexport_fs == NULL && ((zsmbexport_fs = - (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, - "smb_server_share", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - error = zfs_init_sharefs(); - if (error) { - mutex_exit(&zfs_share_lock); - return (ENOSYS); - } - zfs_smbshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - default: - return (EINVAL); - } - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (error = - znfsexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata)) - return (error); - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if 
(error = zsmbexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata, - zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? - B_TRUE : B_FALSE)) { - return (error); - } - break; - } - - opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || - zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? - SHAREFS_ADD : SHAREFS_REMOVE; - - /* - * Add or remove share from sharetab - */ - error = zshare_fs(opcode, - (void *)(uintptr_t)zc->zc_share.z_sharedata, - zc->zc_share.z_sharemax); - - return (error); - -} - -/* - * pool create, destroy, and export don't log the history as part of - * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export - * do the logging of those commands. - */ -static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_objset_zplprops, 
zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, - DATASET_NAME, B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, -}; - 
-static int -zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) -{ - zfs_cmd_t *zc; - uint_t vec; - int error, rc; - - if (getminor(dev) != 0) - return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); - - vec = cmd - ZFS_IOC; - ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); - - if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (EINVAL); - - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - - error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); - - if (error == 0) - error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); - - /* - * Ensure that all pool/dataset names are valid before we pass down to - * the lower layers. - */ - if (error == 0) { - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - switch (zfs_ioc_vec[vec].zvec_namecheck) { - case POOL_NAME: - if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - break; - - case DATASET_NAME: - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - break; - - case NO_NAME: - break; - } - } - - if (error == 0) - error = zfs_ioc_vec[vec].zvec_func(zc); - - rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); - if (error == 0) { - error = rc; - if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) - zfs_log_history(zc); - } - - kmem_free(zc, sizeof (zfs_cmd_t)); - return (error); -} - -static int -zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, - DDI_PSEUDO, 0) == DDI_FAILURE) - return (DDI_FAILURE); - - zfs_dip = dip; - - ddi_report_dev(dip); - - return (DDI_SUCCESS); -} - -static int -zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (spa_busy() || zfs_busy() || zvol_busy()) - return (DDI_FAILURE); - - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - zfs_dip = NULL; - - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -zfs_info(dev_info_t *dip, ddi_info_cmd_t 
infocmd, void *arg, void **result) -{ - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = zfs_dip; - return (DDI_SUCCESS); - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - return (DDI_SUCCESS); - } - - return (DDI_FAILURE); -} - -/* - * OK, so this is a little weird. - * - * /dev/zfs is the control node, i.e. minor 0. - * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. - * - * /dev/zfs has basically nothing to do except serve up ioctls, - * so most of the standard driver entry points are in zvol.c. - */ -static struct cb_ops zfs_cb_ops = { - zvol_open, /* open */ - zvol_close, /* close */ - zvol_strategy, /* strategy */ - nodev, /* print */ - zvol_dump, /* dump */ - zvol_read, /* read */ - zvol_write, /* write */ - zfsdev_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* prop_op */ - NULL, /* streamtab */ - D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ - CB_REV, /* version */ - nodev, /* async read */ - nodev, /* async write */ -}; - -static struct dev_ops zfs_dev_ops = { - DEVO_REV, /* version */ - 0, /* refcnt */ - zfs_info, /* info */ - nulldev, /* identify */ - nulldev, /* probe */ - zfs_attach, /* attach */ - zfs_detach, /* detach */ - nodev, /* reset */ - &zfs_cb_ops, /* driver operations */ - NULL /* no bus operations */ -}; - -static struct modldrv zfs_modldrv = { - &mod_driverops, "ZFS storage pool version " SPA_VERSION_STRING, - &zfs_dev_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - (void *)&zfs_modlfs, - (void *)&zfs_modldrv, - NULL -}; - - -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; - -int -_init(void) -{ - int error; - - spa_init(FREAD | FWRITE); - zfs_init(); - zvol_init(); - - if ((error = mod_install(&modlinkage)) != 0) { - zvol_fini(); - zfs_fini(); - spa_fini(); - return (error); - } - - tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, NULL); - - error = ldi_ident_from_mod(&modlinkage, &zfs_li); 
- ASSERT(error == 0); - mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); - - return (0); -} - -int -_fini(void) -{ - int error; - - if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (EBUSY); - - if ((error = mod_remove(&modlinkage)) != 0) - return (error); - - zvol_fini(); - zfs_fini(); - spa_fini(); - if (zfs_nfsshare_inited) - (void) ddi_modclose(nfs_mod); - if (zfs_smbshare_inited) - (void) ddi_modclose(smbsrv_mod); - if (zfs_nfsshare_inited || zfs_smbshare_inited) - (void) ddi_modclose(sharefs_mod); - - tsd_destroy(&zfs_fsyncer_key); - ldi_ident_release(zfs_li); - zfs_li = NULL; - mutex_destroy(&zfs_share_lock); - - return (error); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} diff --git a/zfs/lib/libdmu-ctl/zfs_log.c b/zfs/lib/libdmu-ctl/zfs_log.c deleted file mode 100644 index 364385808..000000000 --- a/zfs/lib/libdmu-ctl/zfs_log.c +++ /dev/null @@ -1,693 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "@(#)zfs_log.c 1.13 08/04/09 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/cmn_err.h> -#include <sys/kmem.h> -#include <sys/thread.h> -#include <sys/file.h> -#include <sys/vfs.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zil.h> -#include <sys/zil_impl.h> -#include <sys/byteorder.h> -#include <sys/policy.h> -#include <sys/stat.h> -#include <sys/mode.h> -#include <sys/acl.h> -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/zfs_fuid.h> -#include <sys/ddi.h> - -/* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * an intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. - */ - -int -zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) -{ - int isxvattr = (vap->va_mask & AT_XVATTR); - switch (type) { - case Z_FILE: - if (vsecp == NULL && !isxvattr) - return (TX_CREATE); - if (vsecp && isxvattr) - return (TX_CREATE_ACL_ATTR); - if (vsecp) - return (TX_CREATE_ACL); - else - return (TX_CREATE_ATTR); - /*NOTREACHED*/ - case Z_DIR: - if (vsecp == NULL && !isxvattr) - return (TX_MKDIR); - if (vsecp && isxvattr) - return (TX_MKDIR_ACL_ATTR); - if (vsecp) - return (TX_MKDIR_ACL); - else - return (TX_MKDIR_ATTR); - case Z_XATTRDIR: - return (TX_MKXATTR); - } - ASSERT(0); - return (TX_MAX_TYPE); -} - -/* - * build up the log data necessary for logging xvattr_t - * First lr_attr_t is initialized. following the lr_attr_t - * is the mapsize and attribute bitmap copied from the xvattr_t. - * Following the bitmap and bitmapsize two 64 bit words are reserved - * for the create time which may be set. 
Following the create time - * records a single 64 bit integer which has the bits to set on - * replay for the xvattr. - */ -static void -zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - uint32_t *bitmap; - uint64_t *attrs; - uint64_t *crtime; - xoptattr_t *xoap; - void *scanstamp; - int i; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - lrattr->lr_attr_masksize = xvap->xva_mapsize; - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { - *bitmap = xvap->xva_reqattrmap[i]; - } - - /* Now pack the attributes up in a single uint64_t */ - attrs = (uint64_t *)bitmap; - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - *attrs = 0; - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - *attrs |= (xoap->xoa_readonly == 0) ? 0 : - XAT0_READONLY; - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - *attrs |= (xoap->xoa_hidden == 0) ? 0 : - XAT0_HIDDEN; - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - *attrs |= (xoap->xoa_system == 0) ? 0 : - XAT0_SYSTEM; - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - *attrs |= (xoap->xoa_archive == 0) ? 0 : - XAT0_ARCHIVE; - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - *attrs |= (xoap->xoa_immutable == 0) ? 0 : - XAT0_IMMUTABLE; - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - *attrs |= (xoap->xoa_nounlink == 0) ? 0 : - XAT0_NOUNLINK; - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - *attrs |= (xoap->xoa_appendonly == 0) ? 0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - *attrs |= (xoap->xoa_opaque == 0) ? 0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - *attrs |= (xoap->xoa_nodump == 0) ? 0 : - XAT0_NODUMP; - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : - XAT0_AV_QUARANTINED; - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - *attrs |= (xoap->xoa_av_modified == 0) ? 
0 : - XAT0_AV_MODIFIED; - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); -} - -static void * -zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_t *zfuid; - uint64_t *fuidloc = start; - - /* First copy in the ACE FUIDs */ - for (zfuid = list_head(&fuidp->z_fuids); zfuid; - zfuid = list_next(&fuidp->z_fuids, zfuid)) { - *fuidloc++ = zfuid->z_logfuid; - } - return (fuidloc); -} - - -static void * -zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_domain_t *zdomain; - - /* now copy in the domain info, if any */ - if (fuidp->z_domain_str_sz != 0) { - for (zdomain = list_head(&fuidp->z_domains); zdomain; - zdomain = list_next(&fuidp->z_domains, zdomain)) { - bcopy((void *)zdomain->z_domain, start, - strlen(zdomain->z_domain) + 1); - start = (caddr_t)start + - strlen(zdomain->z_domain) + 1; - } - } - return (start); -} - -/* - * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, - * TX_MKDIR_ATTR and TX_MKXATTR - * transactions. - * - * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID - * domain information appended prior to the name. In this case the - * uid/gid in the log record will be a log centric FUID. - * - * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that - * may contain attributes, ACL and optional fuid information. - * - * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify - * and ACL and normal users/groups in the ACEs. - * - * There may be an optional xvattr attribute information similar - * to zfs_log_setattr. - * - * Also, after the file name "domain" strings may be appended. 
- */ -void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, - zfs_fuid_info_t *fuidp, vattr_t *vap) -{ - itx_t *itx; - uint64_t seq; - lr_create_t *lr; - lr_acl_create_t *lracl; - size_t aclsize; - size_t xvatsize = 0; - size_t txsize; - xvattr_t *xvap = (xvattr_t *)vap; - void *end; - size_t lrsize; - - size_t namesize = strlen(name) + 1; - size_t fuidsz = 0; - - if (zilog == NULL) - return; - - /* - * If we have FUIDs present then add in space for - * domains and ACE fuid's if any. - */ - if (fuidp) { - fuidsz += fuidp->z_domain_str_sz; - fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); - } - - if (vap->va_mask & AT_XVATTR) - xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || - (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || - (int)txtype == TX_MKXATTR) { - txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; - lrsize = sizeof (*lr); - } else { - aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; - txsize = - sizeof (lr_acl_create_t) + namesize + fuidsz + - ZIL_ACE_LENGTH(aclsize) + xvatsize; - lrsize = sizeof (lr_acl_create_t); - } - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { - lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; - } else { - lr->lr_uid = fuidp->z_fuid_owner; - } - if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { - lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; - } else { - lr->lr_gid = fuidp->z_fuid_group; - } - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - lr->lr_rdev = zp->z_phys->zp_rdev; - - /* - * Fill in xvattr info if any - */ - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); - end = (caddr_t)lr + lrsize + xvatsize; - } else { - end = (caddr_t)lr + lrsize; - } - - /* Now fill in any ACL info */ - - if (vsecp) { - lracl = (lr_acl_create_t *)&itx->itx_lr; - lracl->lr_aclcnt = vsecp->vsa_aclcnt; - lracl->lr_acl_bytes = aclsize; - lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) - lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lracl->lr_acl_flags = 0; - - bcopy(vsecp->vsa_aclentp, end, aclsize); - end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); - } - - /* drop in FUID info */ - if (fuidp) { - end = zfs_log_fuid_ids(fuidp, end); - end = zfs_log_fuid_domains(fuidp, end); - } - /* - * Now place file name in log record - */ - bcopy(name, end, namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. 
- */ -void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name) -{ - itx_t *itx; - uint64_t seq; - lr_remove_t *lr; - size_t namesize = strlen(name) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_remove_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; -} - -/* - * zfs_log_link() handles TX_LINK transactions. - */ -void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) -{ - itx_t *itx; - uint64_t seq; - lr_link_t *lr; - size_t namesize = strlen(name) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_link_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_symlink() handles TX_SYMLINK transactions. 
- */ -void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) -{ - itx_t *itx; - uint64_t seq; - lr_create_t *lr; - size_t namesize = strlen(name) + 1; - size_t linksize = strlen(link) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); - - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; -} - -/* - * zfs_log_rename() handles TX_RENAME transactions. - */ -void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) -{ - itx_t *itx; - uint64_t seq; - lr_rename_t *lr; - size_t snamesize = strlen(sname) + 1; - size_t dnamesize = strlen(dname) + 1; - - if (zilog == NULL) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); - - seq = zil_itx_assign(zilog, itx, tx); - sdzp->z_last_itx = seq; - tdzp->z_last_itx = seq; - szp->z_last_itx = seq; -} - -/* - * zfs_log_write() handles TX_WRITE transactions. 
- */ -ssize_t zfs_immediate_write_sz = 32768; - -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ - sizeof (lr_write_t)) - -void -zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) -{ - itx_wr_state_t write_state; - boolean_t slogging; - uintptr_t fsync_cnt; - - if (zilog == NULL || zp->z_unlinked) - return; - - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * If the write is greater than zfs_immediate_write_sz and there are - * no separate logs in this pool then later *if* we need to log the - * write then dmu_sync() is used to immediately write the block and - * its block pointer is put in the log record. - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ - slogging = spa_has_slogs(zilog->zl_spa); - if (resid > zfs_immediate_write_sz && !slogging) - write_state = WR_INDIRECT; - else if (ioflag & (FSYNC | FDSYNC)) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { - (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); - } - - while (resid) { - itx_t *itx; - lr_write_t *lr; - ssize_t len; - - /* - * If there are slogs and the write would overflow the largest - * block, then because we don't want to use the main pool - * to dmu_sync, we have to split the write. - */ - if (slogging && resid > ZIL_MAX_LOG_DATA) - len = SPA_MAXBLOCKSIZE >> 1; - else - len = resid; - - itx = zil_itx_create(txtype, sizeof (*lr) + - (write_state == WR_COPIED ? 
len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1) != 0) { - kmem_free(itx, offsetof(itx_t, itx_lr) + - itx->itx_lr.lrc_reclen); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; - } - - itx->itx_wr_state = write_state; - if (write_state == WR_NEED_COPY) - itx->itx_sod += len; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zp->z_zfsvfs; - - if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || - (ioflag & (FSYNC | FDSYNC))) - itx->itx_sync = B_TRUE; - else - itx->itx_sync = B_FALSE; - - zp->z_last_itx = zil_itx_assign(zilog, itx, tx); - - off += len; - resid -= len; - } -} - -/* - * zfs_log_truncate() handles TX_TRUNCATE transactions. - */ -void -zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) -{ - itx_t *itx; - uint64_t seq; - lr_truncate_t *lr; - - if (zilog == NULL || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} - -/* - * zfs_log_setattr() handles TX_SETATTR transactions. 
- */ -void -zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - uint64_t seq; - lr_setattr_t *lr; - xvattr_t *xvap = (xvattr_t *)vap; - size_t recsize = sizeof (lr_setattr_t); - void *start; - - - if (zilog == NULL || zp->z_unlinked) - return; - - /* - * If XVATTR set, then log record size needs to allow - * for lr_attr_t + xvattr mask, mapsize and create time - * plus actual attribute values - */ - if (vap->va_mask & AT_XVATTR) - recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if (fuidp) - recsize += fuidp->z_domain_str_sz; - - itx = zil_itx_create(txtype, recsize); - lr = (lr_setattr_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_mask = (uint64_t)mask_applied; - lr->lr_mode = (uint64_t)vap->va_mode; - if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) - lr->lr_uid = fuidp->z_fuid_owner; - else - lr->lr_uid = (uint64_t)vap->va_uid; - - if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) - lr->lr_gid = fuidp->z_fuid_group; - else - lr->lr_gid = (uint64_t)vap->va_gid; - - lr->lr_size = (uint64_t)vap->va_size; - ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)start, xvap); - start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); - } - - /* - * Now stick on domain information if any on end - */ - - if (fuidp) - (void) zfs_log_fuid_domains(fuidp, start); - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} - -/* - * zfs_log_acl() handles TX_ACL transactions. 
- */ -void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, - vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - uint64_t seq; - lr_acl_v0_t *lrv0; - lr_acl_t *lr; - int txtype; - int lrsize; - size_t txsize; - size_t aclbytes = vsecp->vsa_aclentsz; - - txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ? - TX_ACL_V0 : TX_ACL; - - if (txtype == TX_ACL) - lrsize = sizeof (*lr); - else - lrsize = sizeof (*lrv0); - - if (zilog == NULL || zp->z_unlinked) - return; - - txsize = lrsize + - ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + - (fuidp ? fuidp->z_domain_str_sz : 0) + - sizeof (uint64) * (fuidp ? fuidp->z_fuid_cnt : 0); - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_acl_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - if (txtype == TX_ACL) { - lr->lr_acl_bytes = aclbytes; - lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) - lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lr->lr_acl_flags = 0; - } - lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; - - if (txtype == TX_ACL_V0) { - lrv0 = (lr_acl_v0_t *)lr; - bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); - } else { - void *start = (ace_t *)(lr + 1); - - bcopy(vsecp->vsa_aclentp, start, aclbytes); - - start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); - - if (fuidp) { - start = zfs_log_fuid_ids(fuidp, start); - (void) zfs_log_fuid_domains(fuidp, start); - } - } - - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; -} diff --git a/zfs/lib/libdmu-ctl/zfs_replay.c b/zfs/lib/libdmu-ctl/zfs_replay.c deleted file mode 100644 index ca9990d7c..000000000 --- a/zfs/lib/libdmu-ctl/zfs_replay.c +++ /dev/null @@ -1,876 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_replay.c 1.7 08/01/14 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/cmn_err.h> -#include <sys/kmem.h> -#include <sys/thread.h> -#include <sys/file.h> -#include <sys/fcntl.h> -#include <sys/vfs.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_fuid.h> -#include <sys/spa.h> -#include <sys/zil.h> -#include <sys/byteorder.h> -#include <sys/stat.h> -#include <sys/mode.h> -#include <sys/acl.h> -#include <sys/atomic.h> -#include <sys/cred.h> - -/* - * Functions to replay ZFS intent log (ZIL) records - * The functions are called through a function vector (zfs_replay_vector) - * which is indexed by the transaction type. - */ - -static void -zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) -{ - bzero(vap, sizeof (*vap)); - vap->va_mask = (uint_t)mask; - vap->va_type = IFTOVT(mode); - vap->va_mode = mode & MODEMASK; - vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; - vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; - vap->va_rdev = zfs_cmpldev(rdev); - vap->va_nodeid = nodeid; -} - -/* ARGSUSED */ -static int -zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) -{ - return (ENOTSUP); -} - -static void -zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - uint64_t *attrs; - uint64_t *crtime; - uint32_t *bitmap; - void *scanstamp; - int i; - - xvap->xva_vattr.va_mask |= AT_XVATTR; - if ((xoap = xva_getxoptattr(xvap)) == NULL) { - xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ - return; - } - - ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); - - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) - xvap->xva_reqattrmap[i] = *bitmap; - - attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - xoap->xoa_av_quarantined = - ((*attrs & XAT0_AV_QUARANTINED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - 
ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); -} - -static int -zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) -{ - uint64_t uid_idx; - uint64_t gid_idx; - int domcnt = 0; - - uid_idx = FUID_INDEX(uid); - gid_idx = FUID_INDEX(gid); - if (uid_idx) - domcnt++; - if (gid_idx > 0 && gid_idx != uid_idx) - domcnt++; - - return (domcnt); -} - -static void * -zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, - int domcnt) -{ - int i; - - for (i = 0; i != domcnt; i++) { - fuid_infop->z_domain_table[i] = start; - start = (caddr_t)start + strlen(start) + 1; - } - - return (start); -} - -/* - * Set the uid/gid in the fuid_info structure. - */ -static void -zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) -{ - /* - * If owner or group are log specific FUIDs then slurp up - * domain information and build zfs_fuid_info_t - */ - if (IS_EPHEMERAL(uid)) - fuid_infop->z_fuid_owner = uid; - - if (IS_EPHEMERAL(gid)) - fuid_infop->z_fuid_group = gid; -} - -/* - * Load fuid domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) -{ - int domcnt; - - zfs_fuid_info_t *fuid_infop; - - fuid_infop = zfs_fuid_info_alloc(); - - domcnt = zfs_replay_domain_cnt(uid, gid); - - if (domcnt == 0) - return (fuid_infop); - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - fuid_infop->z_domain_cnt = domcnt; - *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); - return (fuid_infop); -} - -/* - * load zfs_fuid_t's and fuid_domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, - uint64_t gid) -{ - uint64_t *log_fuid = (uint64_t *)start; - zfs_fuid_info_t *fuid_infop; - int i; - - fuid_infop = 
zfs_fuid_info_alloc(); - fuid_infop->z_domain_cnt = domcnt; - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - for (i = 0; i != idcnt; i++) { - zfs_fuid_t *zfuid; - - zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - zfuid->z_logfuid = *log_fuid; - zfuid->z_id = -1; - zfuid->z_domidx = 0; - list_insert_tail(&fuid_infop->z_fuids, zfuid); - log_fuid++; - } - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); - return (fuid_infop); -} - -static void -zfs_replay_swap_attrs(lr_attr_t *lrattr) -{ - /* swap the lr_attr structure */ - byteswap_uint32_array(lrattr, sizeof (*lrattr)); - /* swap the bitmap */ - byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * - sizeof (uint32_t)); - /* swap the attributes, create time + 64 bit word for attributes */ - byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * - (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); -} - -/* - * Replay file create with optional ACL, xvattr information as well - * as option FUID information. 
- */ -static int -zfs_replay_create_acl(zfsvfs_t *zfsvfs, - lr_acl_create_t *lracl, boolean_t byteswap) -{ - char *name = NULL; /* location determined later */ - lr_create_t *lr = (lr_create_t *)lracl; - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - vsecattr_t vsec = { 0 }; - lr_attr_t *lrattr; - void *aclstart; - void *fuidstart; - size_t xvatlen = 0; - uint64_t txtype; - int error; - - if (byteswap) { - byteswap_uint64_array(lracl, sizeof (*lracl)); - txtype = (int)lr->lr_common.lrc_txtype; - if (txtype == TX_CREATE_ACL_ATTR || - txtype == TX_MKDIR_ACL_ATTR) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - zfs_replay_swap_attrs(lrattr); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - } - - aclstart = (caddr_t)(lracl + 1) + xvatlen; - zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); - /* swap fuids */ - if (lracl->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes), - lracl->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - xva_init(&xva); - zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic VOP_CREATE() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. 
- */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) - goto bail; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - switch ((int)lr->lr_common.lrc_txtype) { - case TX_CREATE_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_CREATE_ACL_ATTR: - if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - xva.xva_vattr.va_mask |= AT_XVATTR; - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } - - error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, - 0, 0, &vp, kcred, vflg, NULL, &vsec); - break; - case TX_MKDIR_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_MKDIR_ACL_ATTR: - if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = 
lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } - error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, - &vp, kcred, NULL, vflg, &vsec); - break; - default: - error = ENOTSUP; - } - -bail: - if (error == 0 && vp != NULL) - VN_RELE(vp); - - VN_RELE(ZTOV(dzp)); - - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - - return (error); -} - -static int -zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) -{ - char *name = NULL; /* location determined later */ - char *link; /* symlink content follows name */ - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - size_t lrsize = sizeof (lr_create_t); - lr_attr_t *lrattr; - void *start; - size_t xvatlen; - uint64_t txtype; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - txtype = (int)lr->lr_common.lrc_txtype; - if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) - zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - xva_init(&xva); - zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic VOP_CREATE() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. 
- */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) - goto out; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - /* - * Symlinks don't have fuid info, and CIFS never creates - * symlinks. - * - * The _ATTR versions will grab the fuid info in their subcases. - */ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { - start = (lr + 1); - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - } - - switch ((int)lr->lr_common.lrc_txtype) { - case TX_CREATE_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_CREATE: - if (name == NULL) - name = (char *)start; - - error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, - 0, 0, &vp, kcred, vflg, NULL, NULL); - break; - case TX_MKDIR_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_MKDIR: - if (name == NULL) - name = (char *)(lr + 1); - - error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, - &vp, kcred, NULL, vflg, NULL); - break; - case TX_MKXATTR: - name = (char *)(lr + 1); - error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); - break; - case TX_SYMLINK: - name = (char *)(lr + 1); - link = name + strlen(name) + 1; - error = 
VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr, - link, kcred, NULL, vflg); - break; - default: - error = ENOTSUP; - } - -out: - if (error == 0 && vp != NULL) - VN_RELE(vp); - - VN_RELE(ZTOV(dzp)); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - return (error); -} - -static int -zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) -{ - char *name = (char *)(lr + 1); /* name follows lr_remove_t */ - znode_t *dzp; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - switch ((int)lr->lr_common.lrc_txtype) { - case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), name, kcred, NULL, vflg); - break; - case TX_RMDIR: - error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred, NULL, vflg); - break; - default: - error = ENOTSUP; - } - - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) -{ - char *name = (char *)(lr + 1); /* name follows lr_link_t */ - znode_t *dzp, *zp; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { - VN_RELE(ZTOV(dzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred, NULL, vflg); - - VN_RELE(ZTOV(zp)); - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) -{ - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; - znode_t *sdzp, *tdzp; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); 
- - if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { - VN_RELE(ZTOV(sdzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred, - NULL, vflg); - - VN_RELE(ZTOV(tdzp)); - VN_RELE(ZTOV(sdzp)); - - return (error); -} - -static int -zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) -{ - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - znode_t *zp; - int error; - ssize_t resid; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log writes out of order, it's possible the - * file has been removed. In this case just drop the write - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, - lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int -zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) -{ - znode_t *zp; - flock64_t fl; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log truncates out of order, it's possible the - * file has been removed. In this case just drop the truncate - * and return success. 
- */ - if (error == ENOENT) - error = 0; - return (error); - } - - bzero(&fl, sizeof (fl)); - fl.l_type = F_WRLCK; - fl.l_whence = 0; - fl.l_start = lr->lr_offset; - fl.l_len = lr->lr_length; - - error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, - lr->lr_offset, kcred, NULL); - - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int -zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) -{ - znode_t *zp; - xvattr_t xva; - vattr_t *vap = &xva.xva_vattr; - int error; - void *start; - - xva_init(&xva); - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((lr->lr_mask & AT_XVATTR) && - zfsvfs->z_version >= ZPL_VERSION_INITIAL) - zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log setattrs out of order, it's possible the - * file has been removed. In this case just drop the setattr - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, - lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - - vap->va_size = lr->lr_size; - ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); - - /* - * Fill in xvattr_t portions if necessary. 
- */ - - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_replay_xvattr((lr_attr_t *)start, &xva); - start = (caddr_t)start + - ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); - } else - xva.xva_vattr.va_mask &= ~AT_XVATTR; - - zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - - error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL); - - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int -zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) -{ - ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ - vsecattr_t vsa; - znode_t *zp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_oldace_byteswap(ace, lr->lr_aclcnt); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentp = ace; - - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); - - VN_RELE(ZTOV(zp)); - - return (error); -} - -/* - * Replaying ACLs is complicated by FUID support. - * The log record may contain some optional data - * to be used for replaying FUID's. These pieces - * are the actual FUIDs that were created initially. - * The FUID table index may no longer be valid and - * during zfs_create() a new index may be assigned. - * Because of this the log will contain the original - * doman+rid in order to create a new FUID. - * - * The individual ACEs may contain an ephemeral uid/gid which is no - * longer valid and will need to be replaced with an actual FUID. 
- * - */ -static int -zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) -{ - ace_t *ace = (ace_t *)(lr + 1); - vsecattr_t vsa; - znode_t *zp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); - if (lr->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes), - lr->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentp = ace; - vsa.vsa_aclentsz = lr->lr_acl_bytes; - vsa.vsa_aclflags = lr->lr_acl_flags; - - if (lr->lr_fuidcnt) { - void *fuidstart = (caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes); - - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, &fuidstart, - lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); - } - - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - - zfsvfs->z_fuid_replay = NULL; - VN_RELE(ZTOV(zp)); - - return (error); -} - -/* - * Callback vectors for replaying records - */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { - zfs_replay_error, /* 0 no such transaction type */ - zfs_replay_create, /* TX_CREATE */ - zfs_replay_create, /* TX_MKDIR */ - zfs_replay_create, /* TX_MKXATTR */ - zfs_replay_create, /* TX_SYMLINK */ - zfs_replay_remove, /* TX_REMOVE */ - zfs_replay_remove, /* TX_RMDIR */ - zfs_replay_link, /* TX_LINK */ - zfs_replay_rename, /* TX_RENAME */ - zfs_replay_write, /* TX_WRITE */ - zfs_replay_truncate, /* TX_TRUNCATE */ - zfs_replay_setattr, /* TX_SETATTR */ - zfs_replay_acl_v0, /* TX_ACL_V0 */ - zfs_replay_acl, /* TX_ACL */ - 
zfs_replay_create_acl, /* TX_CREATE_ACL */ - zfs_replay_create, /* TX_CREATE_ATTR */ - zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL */ - zfs_replay_create, /* TX_MKDIR_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ -}; diff --git a/zfs/lib/libdmu-ctl/zfs_rlock.c b/zfs/lib/libdmu-ctl/zfs_rlock.c deleted file mode 100644 index 44ec73b5d..000000000 --- a/zfs/lib/libdmu-ctl/zfs_rlock.c +++ /dev/null @@ -1,602 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zfs_rlock.c 1.4 07/08/08 SMI" - -/* - * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind - * support for growing the blocksize). 
- * - * Interface - * --------- - * Defined in zfs_rlock.h but essentially: - * rl = zfs_range_lock(zp, off, len, lock_type); - * zfs_range_unlock(rl); - * zfs_range_reduce(rl, off, len); - * - * AVL tree - * -------- - * An AVL tree is used to maintain the state of the existing ranges - * that are locked for exclusive (writer) or shared (reader) use. - * The starting range offset is used for searching and sorting the tree. - * - * Common case - * ----------- - * The (hopefully) usual case is of no overlaps or contention for - * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree - * searched that finds no overlap, and *this* rl_t is placed in the tree. - * - * Overlaps/Reference counting/Proxy locks - * --------------------------------------- - * The avl code only allows one node at a particular offset. Also it's very - * inefficient to search through all previous entries looking for overlaps - * (because the very 1st in the ordered list might be at offset 0 but - * cover the whole file). - * So this implementation uses reference counts and proxy range locks. - * Firstly, only reader locks use reference counts and proxy locks, - * because writer locks are exclusive. - * When a reader lock overlaps with another then a proxy lock is created - * for that range and replaces the original lock. If the overlap - * is exact then the reference count of the proxy is simply incremented. - * Otherwise, the proxy lock is split into smaller lock ranges and - * new proxy locks created for non overlapping ranges. - * The reference counts are adjusted accordingly. - * Meanwhile, the orginal lock is kept around (this is the callers handle) - * and its offset and length are used when releasing the lock. - * - * Thread coordination - * ------------------- - * In order to make wakeups efficient and to ensure multiple continuous - * readers on a range don't starve a writer for the same range lock, - * two condition variables are allocated in each rl_t. 
- * If a writer (or reader) can't get a range it initialises the writer - * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; - * and waits on that cv. When a thread unlocks that range it wakes up all - * writers then all readers before destroying the lock. - * - * Append mode writes - * ------------------ - * Append mode writes need to lock a range at the end of a file. - * The offset of the end of the file is determined under the - * range locking mutex, and the lock type converted from RL_APPEND to - * RL_WRITER and the range locked. - * - * Grow block handling - * ------------------- - * ZFS supports multiple block sizes currently upto 128K. The smallest - * block size is used for the file which is grown as needed. During this - * growth all other writers and readers must be excluded. - * So if the block size needs to be grown then the whole file is - * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using zfs_reduce_range. - */ - -#include <sys/zfs_rlock.h> - -/* - * Check if a write lock can be grabbed, or wait and recheck until available. - */ -static void -zfs_range_lock_writer(znode_t *zp, rl_t *new) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl; - avl_index_t where; - uint64_t end_size; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - for (;;) { - /* - * Range locking is also used by zvol and uses a - * dummied up znode. However, for zvol, we don't need to - * append or grow blocksize, and besides we don't have - * a z_phys or z_zfsvfs - so skip that processing. - * - * Yes, this is ugly, and would be solved by not handling - * grow or append in range lock code. If that was done then - * we could make the range locking code generically available - * to other non-zfs consumers. - */ - if (zp->z_vnode) { /* caller is ZPL */ - /* - * If in append mode pick up the current end of file. - * This is done under z_range_lock to avoid races. 
- */ - if (new->r_type == RL_APPEND) - new->r_off = zp->z_phys->zp_size; - - /* - * If we need to grow the block size then grab the whole - * file range. This is also done under z_range_lock to - * avoid races. - */ - end_size = MAX(zp->z_phys->zp_size, new->r_off + len); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->r_off = 0; - new->r_len = UINT64_MAX; - } - } - - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(tree) == 0) { - new->r_type = RL_WRITER; /* convert to writer */ - avl_add(tree, new); - return; - } - - /* - * Look for any locks in the range. - */ - rl = avl_find(tree, new, &where); - if (rl) - goto wait; /* already locked at same offset */ - - rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - if (rl && (rl->r_off < new->r_off + new->r_len)) - goto wait; - - rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - if (rl && rl->r_off + rl->r_len > new->r_off) - goto wait; - - new->r_type = RL_WRITER; /* convert possible RL_APPEND */ - avl_insert(tree, new, where); - return; -wait: - if (!rl->r_write_wanted) { - cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); - rl->r_write_wanted = B_TRUE; - } - cv_wait(&rl->r_wr_cv, &zp->z_range_lock); - - /* reset to original */ - new->r_off = off; - new->r_len = len; - } -} - -/* - * If this is an original (non-proxy) lock then replace it by - * a proxy and return the proxy. 
- */ -static rl_t * -zfs_range_proxify(avl_tree_t *tree, rl_t *rl) -{ - rl_t *proxy; - - if (rl->r_proxy) - return (rl); /* already a proxy */ - - ASSERT3U(rl->r_cnt, ==, 1); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - avl_remove(tree, rl); - rl->r_cnt = 0; - - /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); - proxy->r_off = rl->r_off; - proxy->r_len = rl->r_len; - proxy->r_cnt = 1; - proxy->r_type = RL_READER; - proxy->r_proxy = B_TRUE; - proxy->r_write_wanted = B_FALSE; - proxy->r_read_wanted = B_FALSE; - avl_add(tree, proxy); - - return (proxy); -} - -/* - * Split the range lock at the supplied offset - * returning the *front* proxy. - */ -static rl_t * -zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) -{ - rl_t *front, *rear; - - ASSERT3U(rl->r_len, >, 1); - ASSERT3U(off, >, rl->r_off); - ASSERT3U(off, <, rl->r_off + rl->r_len); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - - /* create the rear proxy range lock */ - rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rear->r_off = off; - rear->r_len = rl->r_off + rl->r_len - off; - rear->r_cnt = rl->r_cnt; - rear->r_type = RL_READER; - rear->r_proxy = B_TRUE; - rear->r_write_wanted = B_FALSE; - rear->r_read_wanted = B_FALSE; - - front = zfs_range_proxify(tree, rl); - front->r_len = off - rl->r_off; - - avl_insert_here(tree, rear, front, AVL_AFTER); - return (front); -} - -/* - * Create and add a new proxy range lock for the supplied range. 
- */ -static void -zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) -{ - rl_t *rl; - - ASSERT(len); - rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rl->r_off = off; - rl->r_len = len; - rl->r_cnt = 1; - rl->r_type = RL_READER; - rl->r_proxy = B_TRUE; - rl->r_write_wanted = B_FALSE; - rl->r_read_wanted = B_FALSE; - avl_add(tree, rl); -} - -static void -zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) -{ - rl_t *next; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - /* - * prev arrives either: - * - pointing to an entry at the same offset - * - pointing to the entry with the closest previous offset whose - * range may overlap with the new range - * - null, if there were no ranges starting before the new one - */ - if (prev) { - if (prev->r_off + prev->r_len <= off) { - prev = NULL; - } else if (prev->r_off != off) { - /* - * convert to proxy if needed then - * split this entry and bump ref count - */ - prev = zfs_range_split(tree, prev, off); - prev = AVL_NEXT(tree, prev); /* move to rear range */ - } - } - ASSERT((prev == NULL) || (prev->r_off == off)); - - if (prev) - next = prev; - else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - - if (next == NULL || off + len <= next->r_off) { - /* no overlaps, use the original new rl_t in the tree */ - avl_insert(tree, new, where); - return; - } - - if (off < next->r_off) { - /* Add a proxy for initial range before the overlap */ - zfs_range_new_proxy(tree, off, next->r_off - off); - } - - new->r_cnt = 0; /* will use proxies in tree */ - /* - * We now search forward through the ranges, until we go past the end - * of the new range. For each entry we make it a proxy if it - * isn't already, then bump its reference count. If there's any - * gaps between the ranges then we create a new proxy range. 
- */ - for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) - break; - if (prev && prev->r_off + prev->r_len < next->r_off) { - /* there's a gap */ - ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - next->r_off - (prev->r_off + prev->r_len)); - } - if (off + len == next->r_off + next->r_len) { - /* exact overlap with end */ - next = zfs_range_proxify(tree, next); - next->r_cnt++; - return; - } - if (off + len < next->r_off + next->r_len) { - /* new range ends in the middle of this block */ - next = zfs_range_split(tree, next, off + len); - next->r_cnt++; - return; - } - ASSERT3U(off + len, >, next->r_off + next->r_len); - next = zfs_range_proxify(tree, next); - next->r_cnt++; - } - - /* Add the remaining end range. */ - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - (off + len) - (prev->r_off + prev->r_len)); -} - -/* - * Check if a reader lock can be grabbed, or wait and recheck until available. - */ -static void -zfs_range_lock_reader(znode_t *zp, rl_t *new) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *prev, *next; - avl_index_t where; - uint64_t off = new->r_off; - uint64_t len = new->r_len; - - /* - * Look for any writer locks in the range. - */ -retry: - prev = avl_find(tree, new, &where); - if (prev == NULL) - prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - - /* - * Check the previous range for a writer lock overlap. - */ - if (prev && (off < prev->r_off + prev->r_len)) { - if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { - if (!prev->r_read_wanted) { - cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); - prev->r_read_wanted = B_TRUE; - } - cv_wait(&prev->r_rd_cv, &zp->z_range_lock); - goto retry; - } - if (off + len < prev->r_off + prev->r_len) - goto got_lock; - } - - /* - * Search through the following ranges to see if there's - * write lock any overlap. 
- */ - if (prev) - next = AVL_NEXT(tree, prev); - else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next; next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) - goto got_lock; - if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { - if (!next->r_read_wanted) { - cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); - next->r_read_wanted = B_TRUE; - } - cv_wait(&next->r_rd_cv, &zp->z_range_lock); - goto retry; - } - if (off + len <= next->r_off + next->r_len) - goto got_lock; - } - -got_lock: - /* - * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_cnt). - */ - zfs_range_add_reader(tree, new, prev, where); -} - -/* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER). Returns the range lock structure - * for later unlocking or reduce range (if entire file - * previously locked as RL_WRITER). - */ -rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) -{ - rl_t *new; - - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - - new = kmem_alloc(sizeof (rl_t), KM_SLEEP); - new->r_zp = zp; - new->r_off = off; - new->r_len = len; - new->r_cnt = 1; /* assume it's going to be in the tree */ - new->r_type = type; - new->r_proxy = B_FALSE; - new->r_write_wanted = B_FALSE; - new->r_read_wanted = B_FALSE; - - mutex_enter(&zp->z_range_lock); - if (type == RL_READER) { - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(&zp->z_range_avl) == 0) - avl_add(&zp->z_range_avl, new); - else - zfs_range_lock_reader(zp, new); - } else - zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - mutex_exit(&zp->z_range_lock); - return (new); -} - -/* - * Unlock a reader lock - */ -static void -zfs_range_unlock_reader(znode_t *zp, rl_t *remove) -{ - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next; - uint64_t len; - - /* - * The common case is when the remove entry is in the tree - * (cnt == 1) meaning 
there's been no other reader locks overlapping - * with this one. Otherwise the remove entry will have been - * removed from the tree and replaced by proxies (one or - * more ranges mapping to the entire range). - */ - if (remove->r_cnt == 1) { - avl_remove(tree, remove); - if (remove->r_write_wanted) { - cv_broadcast(&remove->r_wr_cv); - cv_destroy(&remove->r_wr_cv); - } - if (remove->r_read_wanted) { - cv_broadcast(&remove->r_rd_cv); - cv_destroy(&remove->r_rd_cv); - } - } else { - ASSERT3U(remove->r_cnt, ==, 0); - ASSERT3U(remove->r_write_wanted, ==, 0); - ASSERT3U(remove->r_read_wanted, ==, 0); - /* - * Find start proxy representing this reader lock, - * then decrement ref count on all proxies - * that make up this range, freeing them as needed. - */ - rl = avl_find(tree, remove, NULL); - ASSERT(rl); - ASSERT(rl->r_cnt); - ASSERT(rl->r_type == RL_READER); - for (len = remove->r_len; len != 0; rl = next) { - len -= rl->r_len; - if (len) { - next = AVL_NEXT(tree, rl); - ASSERT(next); - ASSERT(rl->r_off + rl->r_len == next->r_off); - ASSERT(next->r_cnt); - ASSERT(next->r_type == RL_READER); - } - rl->r_cnt--; - if (rl->r_cnt == 0) { - avl_remove(tree, rl); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); - } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); - } - kmem_free(rl, sizeof (rl_t)); - } - } - } - kmem_free(remove, sizeof (rl_t)); -} - -/* - * Unlock range and destroy range lock structure. 
- */ -void -zfs_range_unlock(rl_t *rl) -{ - znode_t *zp = rl->r_zp; - - ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); - ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); - ASSERT(!rl->r_proxy); - - mutex_enter(&zp->z_range_lock); - if (rl->r_type == RL_WRITER) { - /* writer locks can't be shared or split */ - avl_remove(&zp->z_range_avl, rl); - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); - } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); - } - kmem_free(rl, sizeof (rl_t)); - } else { - /* - * lock may be shared, let zfs_range_unlock_reader() - * release the lock and free the rl_t - */ - zfs_range_unlock_reader(zp, rl); - mutex_exit(&zp->z_range_lock); - } -} - -/* - * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusivly locked and so there's only one - * entry in the tree. - */ -void -zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) -{ - znode_t *zp = rl->r_zp; - - /* Ensure there are no other locks */ - ASSERT(avl_numnodes(&zp->z_range_avl) == 1); - ASSERT(rl->r_off == 0); - ASSERT(rl->r_type == RL_WRITER); - ASSERT(!rl->r_proxy); - ASSERT3U(rl->r_len, ==, UINT64_MAX); - ASSERT3U(rl->r_cnt, ==, 1); - - mutex_enter(&zp->z_range_lock); - rl->r_off = off; - rl->r_len = len; - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) - cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) - cv_broadcast(&rl->r_rd_cv); -} - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. 
- */ -int -zfs_range_compare(const void *arg1, const void *arg2) -{ - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); -} diff --git a/zfs/lib/libdmu-ctl/zfs_vfsops.c b/zfs/lib/libdmu-ctl/zfs_vfsops.c deleted file mode 100644 index 39c8ce4ef..000000000 --- a/zfs/lib/libdmu-ctl/zfs_vfsops.c +++ /dev/null @@ -1,1671 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "@(#)zfs_vfsops.c 1.41 08/04/11 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/kmem.h> -#include <sys/pathname.h> -#include <sys/vnode.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/mntent.h> -#include <sys/mount.h> -#include <sys/cmn_err.h> -#include "fs/fs_subr.h" -#include <sys/zfs_znode.h> -#include <sys/zfs_dir.h> -#include <sys/zil.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_deleg.h> -#include <sys/spa.h> -#include <sys/zap.h> -#include <sys/varargs.h> -#include <sys/policy.h> -#include <sys/atomic.h> -#include <sys/mkdev.h> -#include <sys/modctl.h> -#include <sys/refstr.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/bootconf.h> -#include <sys/sunddi.h> -#include <sys/dnlc.h> -#include <sys/dmu_objset.h> -#include <sys/spa_boot.h> - -int zfsfstype; -vfsops_t *zfs_vfsops = NULL; -static major_t zfs_major; -static minor_t zfs_minor; -static kmutex_t zfs_dev_mtx; - -static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); -static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); -static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); -static int zfs_root(vfs_t *vfsp, vnode_t **vpp); -static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); -static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); -static void zfs_freevfs(vfs_t *vfsp); - -static const fs_operation_def_t zfs_vfsops_template[] = { - VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, - VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, - VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, - VFSNAME_ROOT, { .vfs_root = zfs_root }, - VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, - VFSNAME_SYNC, { .vfs_sync = zfs_sync }, - VFSNAME_VGET, { .vfs_vget = zfs_vget }, - VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, - 
NULL, NULL -}; - -static const fs_operation_def_t zfs_vfsops_eio_template[] = { - VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, - NULL, NULL -}; - -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; -static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; -static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; -static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; - -/* - * MO_DEFAULT is not used since the default value is determined - * by the equivalent property. - */ -static mntopt_t mntopts[] = { - { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, - { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, - { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, - { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } -}; - -static mntopts_t zfs_mntopts = { - sizeof (mntopts) / sizeof (mntopt_t), - mntopts -}; - -/*ARGSUSED*/ -int -zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) -{ - /* - * Data integrity is job one. We don't want a compromised kernel - * writing to the storage pool, so we never sync during panic. - */ - if (panicstr) - return (0); - - /* - * SYNC_ATTR is used by fsflush() to force old filesystems like UFS - * to sync metadata, which they would otherwise cache indefinitely. - * Semantically, the only requirement is that the sync be initiated. - * The DMU syncs out txgs frequently, so there's nothing to do. - */ - if (flag & SYNC_ATTR) - return (0); - - if (vfsp != NULL) { - /* - * Sync a specific filesystem. - */ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - ZFS_ENTER(zfsvfs); - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - ZFS_EXIT(zfsvfs); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). 
Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. - */ - spa_sync_allpools(); - } - - return (0); -} - -static int -zfs_create_unique_device(dev_t *dev) -{ - major_t new_major; - - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - *dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(*dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers for the - * current major number. Create a new major number. - */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique major " - "device number."); - return (-1); - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - - return (0); -} - -static void -atime_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - zfsvfs->z_atime = TRUE; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); - } else { - zfsvfs->z_atime = FALSE; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); - } -} - -static void -xattr_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - /* XXX locking on vfs_flag? 
*/ - zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); - } -} - -static void -blksz_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval < SPA_MINBLOCKSIZE || - newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) - newval = SPA_MAXBLOCKSIZE; - - zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->vfs_bsize = newval; -} - -static void -readonly_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval) { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); - } -} - -static void -devices_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); - } -} - -static void -setuid_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); - } -} - -static void 
-exec_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); - } -} - -/* - * The nbmand mount option can be changed at mount time. - * We can't allow it to be toggled on live file systems or incorrect - * behavior may be seen from cifs clients - * - * This property isn't registered via dsl_prop_register(), but this callback - * will be called when a file system is first mounted - */ -static void -nbmand_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - if (newval == FALSE) { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); - } else { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); - } -} - -static void -snapdir_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_show_ctldir = newval; -} - -static void -vscan_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_vscan = newval; -} - -static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void -acl_inherit_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_inherit = newval; -} - -static int -zfs_register_callbacks(vfs_t *vfsp) -{ - struct dsl_dataset *ds = NULL; - objset_t *os = NULL; - zfsvfs_t *zfsvfs = NULL; - uint64_t nbmand; - int readonly, do_readonly = B_FALSE; - int setuid, do_setuid = B_FALSE; - int exec, do_exec = B_FALSE; - int devices, do_devices = B_FALSE; - int xattr, do_xattr = B_FALSE; - int atime, do_atime = B_FALSE; - int error = 0; - - ASSERT(vfsp); - zfsvfs 
= vfsp->vfs_data; - ASSERT(zfsvfs); - os = zfsvfs->z_os; - - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices = B_FALSE; - setuid = B_FALSE; - do_devices = B_TRUE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { - devices = B_FALSE; - do_devices = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { - devices = B_TRUE; - do_devices = B_TRUE; - } - - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { - xattr = B_FALSE; - do_xattr = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { - xattr = B_TRUE; - do_xattr = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { - atime = B_FALSE; - do_atime = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { - atime = B_TRUE; - do_atime = B_TRUE; - } - - /* - * nbmand is a special property. It can only be changed at - * mount time. - * - * This is weird, but it is documented to only be changeable - * at mount time. 
- */ - if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { - nbmand = B_FALSE; - } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { - nbmand = B_TRUE; - } else { - char osname[MAXNAMELEN]; - - dmu_objset_name(os, osname); - if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, - NULL)) - return (error); - } - - /* - * Register property callbacks. - * - * It would probably be fine to just check for i/o error from - * the first prop_register(), but I guess I like to go - * overboard... - */ - ds = dmu_objset_ds(os); - error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "xattr", xattr_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "recordsize", blksz_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "readonly", readonly_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "devices", devices_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "setuid", setuid_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "exec", exec_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "snapdir", snapdir_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "aclinherit", acl_inherit_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - "vscan", vscan_changed_cb, zfsvfs); - if (error) - goto unregister; - - /* - * Invoke our callbacks to restore temporary mount options. 
- */ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_devices) - devices_changed_cb(zfsvfs, devices); - if (do_xattr) - xattr_changed_cb(zfsvfs, xattr); - if (do_atime) - atime_changed_cb(zfsvfs, atime); - - nbmand_changed_cb(zfsvfs, nbmand); - - return (0); - -unregister: - /* - * We may attempt to unregister some callbacks that are not - * registered, but this is OK; it will simply return ENOMSG, - * which we will ignore. - */ - (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, - zfsvfs); - (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); - return (error); - -} - -static int -zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) -{ - uint_t readonly; - int error; - - error = zfs_register_callbacks(zfsvfs->z_vfs); - if (error) - return (error); - - /* - * Set the objset user_ptr to track its zfsvfs. - */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); - - /* - * If we are not mounting (ie: online recv), then we don't - * have to worry about replaying the log as we blocked all - * operations out since we closed the ZIL. 
- */ - if (mounting) { - /* - * During replay we remove the read only flag to - * allow replays to succeed. - */ - readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - if (readonly != 0) - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - else - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest doesn't - * use readonly mounts, where zfs_unlinked_drain() isn't - * called.) This is because ziltest causes spa_sync() - * to think it's committed, but actually it is not, so - * the intent log contains many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated in - * a yet later txg. This would write a "create object - * N" record to the intent log. Normally, this would be - * fine because the spa_sync() would have written out - * the fact that object N is free, before we could write - * the "create object N" intent log record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. 
- */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); - - zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ - } - - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - - return (0); -} - -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) -{ - mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); - list_destroy(&zfsvfs->z_all_znodes); - rrw_destroy(&zfsvfs->z_teardown_lock); - rw_destroy(&zfsvfs->z_teardown_inactive_lock); - rw_destroy(&zfsvfs->z_fuid_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} - -static int -zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) -{ - dev_t mount_dev; - uint64_t recordsize, readonly; - int error = 0; - int mode; - zfsvfs_t *zfsvfs; - znode_t *zp = NULL; - - ASSERT(vfsp); - ASSERT(osname); - - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); - - /* Initialize the generic filesystem structure. 
*/ - vfsp->vfs_bcount = 0; - vfsp->vfs_data = NULL; - - if (zfs_create_unique_device(&mount_dev) == -1) { - error = ENODEV; - goto out; - } - ASSERT(vfs_devismounted(mount_dev) == 0); - - if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, - NULL)) - goto out; - - vfsp->vfs_dev = mount_dev; - vfsp->vfs_fstype = zfsfstype; - vfsp->vfs_bsize = recordsize; - vfsp->vfs_flag |= VFS_NOTRUNC; - vfsp->vfs_data = zfsvfs; - - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - if (readonly) - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - else - mode = DS_MODE_PRIMARY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp, cr)) - goto out; - - /* The call to zfs_init_fs leaves the vnode held, release it here. */ - VN_RELE(ZTOV(zp)); - - /* - * Set features for file system. 
- */ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); - } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - } - - if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t pval; - - ASSERT(mode & DS_MODE_READONLY); - atime_changed_cb(zfsvfs, B_FALSE); - readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) - goto out; - xattr_changed_cb(zfsvfs, pval); - zfsvfs->z_issnap = B_TRUE; - } else { - error = zfsvfs_setup(zfsvfs, B_TRUE); - } - - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); - } else { - atomic_add_32(&zfs_active_fs_count, 1); - } - - return (error); -} - -void -zfs_unregister_callbacks(zfsvfs_t *zfsvfs) -{ - objset_t *os = zfsvfs->z_os; - struct dsl_dataset *ds; - - /* - * Unregister properties. 
- */ - if (!dmu_objset_is_snapshot(os)) { - ds = dmu_objset_ds(os); - VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclinherit", - acl_inherit_changed_cb, zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "vscan", - vscan_changed_cb, zfsvfs) == 0); - } -} - -/* - * Convert a decimal digit string to a uint64_t integer. - */ -static int -str_to_uint64(char *str, uint64_t *objnum) -{ - uint64_t num = 0; - - while (*str) { - if (*str < '0' || *str > '9') - return (EINVAL); - - num = num*10 + *str++ - '0'; - } - - *objnum = num; - return (0); -} - -/* - * The boot path passed from the boot loader is in the form of - * "rootpool-name/root-filesystem-object-number'. Convert this - * string to a dataset name: "rootpool-name/root-filesystem-name". 
- */ -static int -zfs_parse_bootfs(char *bpath, char *outpath) -{ - char *slashp; - uint64_t objnum; - int error; - - if (*bpath == 0 || *bpath == '/') - return (EINVAL); - - slashp = strchr(bpath, '/'); - - /* if no '/', just return the pool name */ - if (slashp == NULL) { - (void) strcpy(outpath, bpath); - return (0); - } - - if (error = str_to_uint64(slashp+1, &objnum)) - return (error); - - *slashp = '\0'; - error = dsl_dsobj_to_dsname(bpath, objnum, outpath); - *slashp = '/'; - - return (error); -} - -static int -zfs_mountroot(vfs_t *vfsp, enum whymountroot why) -{ - int error = 0; - static int zfsrootdone = 0; - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - char *zfs_bootfs; - - ASSERT(vfsp); - - /* - * The filesystem that we mount as root is defined in the - * boot property "zfs-bootfs" with a format of - * "poolname/root-dataset-objnum". - */ - if (why == ROOT_INIT) { - if (zfsrootdone++) - return (EBUSY); - /* - * the process of doing a spa_load will require the - * clock to be set before we could (for example) do - * something better by looking at the timestamp on - * an uberblock, so just set it to -1. 
- */ - clkset(-1); - - if ((zfs_bootfs = spa_get_bootfs()) == NULL) { - cmn_err(CE_NOTE, "\nspa_get_bootfs: can not get " - "bootfs name \n"); - return (EINVAL); - } - - if (error = spa_import_rootpool(rootfs.bo_name)) { - spa_free_bootfs(zfs_bootfs); - cmn_err(CE_NOTE, "\nspa_import_rootpool: error %d\n", - error); - return (error); - } - - if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { - spa_free_bootfs(zfs_bootfs); - cmn_err(CE_NOTE, "\nzfs_parse_bootfs: error %d\n", - error); - return (error); - } - - spa_free_bootfs(zfs_bootfs); - - if (error = vfs_lock(vfsp)) - return (error); - - if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) { - cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error); - goto out; - } - - zfsvfs = (zfsvfs_t *)vfsp->vfs_data; - ASSERT(zfsvfs); - if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { - cmn_err(CE_NOTE, "\nzfs_zget: error %d\n", error); - goto out; - } - - vp = ZTOV(zp); - mutex_enter(&vp->v_lock); - vp->v_flag |= VROOT; - mutex_exit(&vp->v_lock); - rootvp = vp; - - /* - * The zfs_zget call above returns with a hold on vp, we release - * it here. - */ - VN_RELE(vp); - - vfs_add((struct vnode *)0, vfsp, - (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); -out: - vfs_unlock(vfsp); - return (error); - } else if (why == ROOT_REMOUNT) { - readonly_changed_cb(vfsp->vfs_data, B_FALSE); - vfsp->vfs_flag |= VFS_REMOUNT; - - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - return (zfs_register_callbacks(vfsp)); - - } else if (why == ROOT_UNMOUNT) { - zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); - (void) zfs_sync(vfsp, 0, 0); - return (0); - } - - /* - * if "why" is equal to anything else other than ROOT_INIT, - * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 
- */ - return (ENOTSUP); -} - -/*ARGSUSED*/ -static int -zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) -{ - char *osname; - pathname_t spn; - int error = 0; - uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? - UIO_SYSSPACE : UIO_USERSPACE; - int canwrite; - - if (mvp->v_type != VDIR) - return (ENOTDIR); - - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return (EBUSY); - } - mutex_exit(&mvp->v_lock); - - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (EINVAL); - - /* - * Get the objset name (the "special" mount argument). - */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); - - osname = spn.pn_path; - - /* - * Check for mount privilege? - * - * If we don't have privilege then see if - * we have local permission to allow it - */ - error = secpolicy_fs_mount(cr, mvp, vfsp); - if (error) { - error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); - if (error == 0) { - vattr_t vattr; - - /* - * Make sure user is the owner of the mount point - * or has sufficient privileges. - */ - - vattr.va_mask = AT_UID; - - if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { - goto out; - } - - if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && - VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { - error = EPERM; - goto out; - } - - secpolicy_fs_mount_clearopts(cr, vfsp); - } else { - goto out; - } - } - - /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. 
- */ - if (!INGLOBALZONE(curproc) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = EPERM; - goto out; - } - - /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. - */ - if (uap->flags & MS_REMOUNT) { - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - error = zfs_register_callbacks(vfsp); - goto out; - } - - error = zfs_domount(vfsp, osname, cr); - -out: - pn_free(&spn); - return (error); -} - -static int -zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - dev32_t d32; - uint64_t refdbytes, availbytes, usedobjs, availobjs; - - ZFS_ENTER(zfsvfs); - - dmu_objset_space(zfsvfs->z_os, - &refdbytes, &availbytes, &usedobjs, &availobjs); - - /* - * The underlying storage pool actually uses multiple block sizes. - * We report the fragsize as the smallest block size we support, - * and we report our blocksize as the filesystem's maximum blocksize. - */ - statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; - statp->f_bsize = zfsvfs->z_max_blksz; - - /* - * The following report "total" blocks of various kinds in the - * file system, but reported in terms of f_frsize - the - * "fragment" size. - */ - - statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; - statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; - statp->f_bavail = statp->f_bfree; /* no root reservation */ - - /* - * statvfs() should really be called statufs(), because it assumes - * static metadata. ZFS doesn't preallocate files, so the best - * we can do is report the max that could possibly fit in f_files, - * and that minus the number actually used in f_ffree. - * For f_ffree, report the smaller of the number of object available - * and the number of blocks (each object will take at least a block). 
- */ - statp->f_ffree = MIN(availobjs, statp->f_bfree); - statp->f_favail = statp->f_ffree; /* no "root reservation" */ - statp->f_files = statp->f_ffree + usedobjs; - - (void) cmpldev(&d32, vfsp->vfs_dev); - statp->f_fsid = d32; - - /* - * We're a zfs filesystem. - */ - (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); - - statp->f_flag = vf_to_stf(vfsp->vfs_flag); - - statp->f_namemax = ZFS_MAXNAMELEN; - - /* - * We have all of 32 characters to stuff a string here. - * Is there anything useful we could/should provide? - */ - bzero(statp->f_fstr, sizeof (statp->f_fstr)); - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_root(vfs_t *vfsp, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *rootzp; - int error; - - ZFS_ENTER(zfsvfs); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); - if (error == 0) - *vpp = ZTOV(rootzp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Teardown the zfsvfs::z_os. - * - * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' - * and 'z_teardown_inactive_lock' held. - */ -static int -zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) -{ - znode_t *zp; - - rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - - if (!unmounting) { - /* - * We purge the parent filesystem's vfsp as the parent - * filesystem and all of its snapshots have their vnode's - * v_vfsp set to the parent's filesystem's vfsp. Note, - * 'z_parent' is self referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); - } - - /* - * Close the zil. NB: Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. 
- */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); - - /* - * If we are not unmounting (ie: online recv) and someone already - * unmounted this file system while we were doing the switcheroo, - * or a reopen of z_os failed then just bail out now. - */ - if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { - rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); - return (EIO); - } - - /* - * At this point there are no vops active, and any new vops will - * fail with EIO since we have z_teardown_lock for writer (only - * relavent for forced unmount). - * - * Release all holds on dbufs. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; - zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { - ASSERT(ZTOV(zp)->v_count > 0); - zfs_znode_dmu_fini(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * If we are unmounting, set the unmounted flag and let new vops - * unblock. zfs_inactive will have the unmounted behavior, and all - * other vops will fail with EIO. - */ - if (unmounting) { - zfsvfs->z_unmounted = B_TRUE; - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - } - - /* - * z_os will be NULL if there was an error in attempting to reopen - * zfsvfs, so just return as the properties had already been - * unregistered and cached data had been evicted before. - */ - if (zfsvfs->z_os == NULL) - return (0); - - /* - * Unregister properties. 
- */ - zfs_unregister_callbacks(zfsvfs); - - /* - * Evict cached data - */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } - - return (0); -} - -/*ARGSUSED*/ -static int -zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - objset_t *os; - int ret; - - ret = secpolicy_fs_unmount(cr, vfsp); - if (ret) { - ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), - ZFS_DELEG_PERM_MOUNT, cr); - if (ret) - return (ret); - } - - /* - * We purge the parent filesystem's vfsp as the parent filesystem - * and all of its snapshots have their vnode's v_vfsp set to the - * parent's filesystem's vfsp. Note, 'z_parent' is self - * referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); - - /* - * Unmount any snapshots mounted under .zfs before unmounting the - * dataset itself. - */ - if (zfsvfs->z_ctldir != NULL && - (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { - return (ret); - } - - if (!(fflag & MS_FORCE)) { - /* - * Check the number of active vnodes in the file system. - * Our count is maintained in the vfs structure, but the - * number is off by 1 to indicate a hold on the vfs - * structure itself. - * - * The '.zfs' directory maintains a reference of its - * own, and any active references underneath are - * reflected in the vnode count. - */ - if (zfsvfs->z_ctldir == NULL) { - if (vfsp->vfs_count > 1) - return (EBUSY); - } else { - if (vfsp->vfs_count > 2 || - zfsvfs->z_ctldir->v_count > 1) - return (EBUSY); - } - } - - vfsp->vfs_flag |= VFS_UNMOUNTED; - - VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); - os = zfsvfs->z_os; - - /* - * z_os will be NULL if there was an error in - * attempting to reopen zfsvfs. - */ - if (os != NULL) { - /* - * Unset the objset user_ptr. 
- */ - mutex_enter(&os->os->os_user_ptr_lock); - dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); - - /* - * Finally close the objset - */ - dmu_objset_close(os); - } - - /* - * We can now safely destroy the '.zfs' directory node. - */ - if (zfsvfs->z_ctldir != NULL) - zfsctl_destroy(zfsvfs); - - return (0); -} - -static int -zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; - - *vpp = NULL; - - ZFS_ENTER(zfsvfs); - - if (fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - - ZFS_EXIT(zfsvfs); - - err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); - if (err) - return (EINVAL); - ZFS_ENTER(zfsvfs); - } - - if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); - } else { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* A zero fid_gen means we are in the .zfs control directories */ - if (fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *vpp = zfsvfs->z_ctldir; - ASSERT(*vpp != NULL); - if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, - 0, NULL, NULL, NULL, NULL, NULL) == 0); - } else { - VN_HOLD(*vpp); - } - ZFS_EXIT(zfsvfs); - return (0); - } - - gen_mask = -1ULL >> (64 - 8 * i); - - dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); - if (err = 
zfs_zget(zfsvfs, object, &zp)) { - ZFS_EXIT(zfsvfs); - return (err); - } - zp_gen = zp->z_phys->zp_gen & gen_mask; - if (zp_gen == 0) - zp_gen = 1; - if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - VN_RELE(ZTOV(zp)); - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Block out VOPs and close zfsvfs_t::z_os - * - * Note, if successful, then we return with the 'z_teardown_lock' and - * 'z_teardown_inactive_lock' write held. - */ -int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) -{ - int error; - - if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) - return (error); - - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); - - return (0); -} - -/* - * Reopen zfsvfs_t::z_os and release VOPs. - */ -int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) -{ - int err; - - ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); - ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (err) { - zfsvfs->z_os = NULL; - } else { - znode_t *zp; - - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); - - /* - * Attempt to re-establish all the active znodes with - * their dbufs. If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP - * when they try to use their znode. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; - zp = list_next(&zfsvfs->z_all_znodes, zp)) { - (void) zfs_rezget(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - - } - - /* release the VOPs */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); - - if (err) { - /* - * Since we couldn't reopen zfsvfs::z_os, force - * unmount this file system. 
- */ - if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) - (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); - } - return (err); -} - -static void -zfs_freevfs(vfs_t *vfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); - - atomic_add_32(&zfs_active_fs_count, -1); -} - -/* - * VFS_INIT() initialization. Note that there is no VFS_FINI(), - * so we can't safely do any non-idempotent initialization here. - * Leave that to zfs_init() and zfs_fini(), which are called - * from the module's _init() and _fini() entry points. - */ -/*ARGSUSED*/ -static int -zfs_vfsinit(int fstype, char *name) -{ - int error; - - zfsfstype = fstype; - - /* - * Setup vfsops and vnodeops tables. - */ - error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); - if (error != 0) { - cmn_err(CE_WARN, "zfs: bad vfs ops template"); - } - - error = zfs_create_op_tables(); - if (error) { - zfs_remove_op_tables(); - cmn_err(CE_WARN, "zfs: bad vnode ops template"); - (void) vfs_freevfsops_by_type(zfsfstype); - return (error); - } - - mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - - /* - * Unique major number for all zfs mounts. - * If we run out of 32-bit minors, we'll getudev() another major. - */ - zfs_major = ddi_name_to_major(ZFS_DRIVER); - zfs_minor = ZFS_MIN_MINOR; - - return (0); -} - -void -zfs_init(void) -{ - /* - * Initialize .zfs directory structures - */ - zfsctl_init(); - - /* - * Initialize znode cache, vnode ops, etc... - */ - zfs_znode_init(); -} - -void -zfs_fini(void) -{ - zfsctl_fini(); - zfs_znode_fini(); -} - -int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} - -int -zfs_set_version(const char *name, uint64_t newvers) -{ - int error; - objset_t *os; - dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. 
Would - * be nice to find the zfsvfs_t and just update that if - * possible. - */ - - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) - return (EINVAL); - - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_PRIMARY, &os); - if (error) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto out; - } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); - - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); - dmu_tx_commit(tx); - -out: - dmu_objset_close(os); - return (error); -} - -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - const char *pname; - int error; - - /* - * Look up the file system's value for the property. For the - * version property, we look up a slightly different string. 
- */ - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - default: - return (error); - } - error = 0; - } - return (error); -} - -static vfsdef_t vfw = { - VFSDEF_VERSION, - MNTTYPE_ZFS, - zfs_vfsinit, - VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| - VSW_XID, - &zfs_mntopts -}; - -struct modlfs zfs_modlfs = { - &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw -}; diff --git a/zfs/lib/libdmu-ctl/zfs_vnops.c b/zfs/lib/libdmu-ctl/zfs_vnops.c deleted file mode 100644 index 3f36328de..000000000 --- a/zfs/lib/libdmu-ctl/zfs_vnops.c +++ /dev/null @@ -1,4558 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -/* Portions Copyright 2007 Jeremy Teo */ - -#pragma ident "@(#)zfs_vnops.c 1.73 08/04/27 SMI" - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/sysmacros.h> -#include <sys/resource.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/taskq.h> -#include <sys/uio.h> -#include <sys/vmsystm.h> -#include <sys/atomic.h> -#include <sys/vm.h> -#include <vm/seg_vn.h> -#include <vm/pvn.h> -#include <vm/as.h> -#include <sys/mman.h> -#include <sys/pathname.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/unistd.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_ioctl.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/dbuf.h> -#include <sys/zap.h> -#include <sys/dirent.h> -#include <sys/policy.h> -#include <sys/sunddi.h> -#include <sys/filio.h> -#include "fs/fs_subr.h" -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/dnlc.h> -#include <sys/zfs_rlock.h> -#include <sys/extdirent.h> -#include <sys/kidmap.h> -#include <sys/cred_impl.h> -#include <sys/attr.h> - -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait for the intent log to commit if it is a synchronous operation. - * Moreover, the vnode ops must work in both normal and log replay context. - * The ordering of events is important to avoid deadlocks and references - * to freed memory. The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). 
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. - * - * (2) VN_RELE() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). - * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. - * This is critical because we don't want to block while holding locks. - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing to - * use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. 
- * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * zil_commit(zilog, seq, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ - -/* ARGSUSED */ -static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(*vpp); - - if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && - ((flag & FAPPEND) == 0)) { - return (EPERM); - } - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) - if (fs_vscan(*vpp, cr, 0) != 0) - 
return (EACCES); - - /* Keep a count of the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) - atomic_inc_32(&zp->z_sync_cnt); - - return (0); -} - -/* ARGSUSED */ -static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - - /* Decrement the synchronous opens in the znode */ - if ((flag & (FSYNC | FDSYNC)) && (count == 1)) - atomic_dec_32(&zp->z_sync_cnt); - - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) - VERIFY(fs_vscan(vp, cr, 1) == 0); - - return (0); -} - -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. - */ -static int -zfs_holey(vnode_t *vp, int cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_phys->zp_size; - if (noff >= file_sz) { - return (ENXIO); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - /* end of file? */ - if ((error == ESRCH) || (noff > file_sz)) { - /* - * Handle the virtual hole at the end of file. - */ - if (hole) { - *off = file_sz; - return (0); - } - return (ENXIO); - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -/* ARGSUSED */ -static int -zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, - int *rvalp, caller_context_t *ct) -{ - offset_t off; - int error; - zfsvfs_t *zfsvfs; - znode_t *zp; - - switch (com) { - case _FIOFFS: - return (zfs_sync(vp->v_vfsp, 0, cred)); - - /* - * The following two ioctls are used by bfu. 
Faking out, - * necessary to avoid bfu errors. - */ - case _FIOGDIO: - case _FIOSDIO: - return (0); - - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: - if (ddi_copyin((void *)data, &off, sizeof (off), flag)) - return (EFAULT); - - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); - if (error) - return (error); - if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) - return (EFAULT); - return (0); - } - return (ENOTTY); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; - - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). 
- */ - rw_enter(&zp->z_map_lock, RW_WRITER); - if (pp = page_lookup(vp, start, SE_SHARED)) { - caddr_t va; - - rw_exit(&zp->z_map_lock); - va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } - ppmapout(va); - page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); - } - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; - int64_t start, off; - int len = nbytes; - int error = 0; - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - - if (pp = page_lookup(vp, start, SE_SHARED)) { - caddr_t va; - - va = ppmapin(pp, PROT_READ, (caddr_t)-1L); - error = uiomove(va + off, bytes, UIO_READ, uio); - ppmapout(va); - page_unlock(pp); - } else { - error = dmu_read_uio(os, zp->z_id, uio, bytes); - } - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} - -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. 
- * ct - caller context - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 if success - * error code if failure - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; - ssize_t n, nbytes; - int error; - rl_t *rl; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - os = zfsvfs->z_os; - - if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (EACCES); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { - if (error = chklock(vp, FREAD, - uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - */ - if (ioflag & FRSYNC) - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - - /* - * Lock the range against changes. - */ - rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. 
- */ - if (uio->uio_loffset >= zp->z_phys->zp_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (vn_has_cached_data(vp)) - error = mappedread(vp, nbytes, uio); - else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); - if (error) - break; - - n -= nbytes; - } - -out: - zfs_range_unlock(rl); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. - * Any error will exit this routine as this is only a best - * attempt to get the pages resident. This is a copy of ufs_trans_touch(). - */ -static void -zfs_prefault_write(ssize_t n, struct uio *uio) -{ - struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - uint8_t tmp; - - iov = uio->uio_iov; - - while (n) { - cnt = MIN(iov->iov_len, n); - if (cnt == 0) { - /* empty iov entry */ - iov++; - continue; - } - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base; - while (cnt) { - switch (uio->uio_segflg) { - case UIO_USERSPACE: - case UIO_USERISPACE: - if (fuword8(p, &tmp)) - return; - break; - case UIO_SYSSPACE: - if (kcopy(p, &tmp, 1)) - return; - break; - } - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - switch (uio->uio_segflg) { - case UIO_USERSPACE: - case UIO_USERISPACE: - if (fuword8(p, &tmp)) - return; - break; - case UIO_SYSSPACE: - if (kcopy(p, &tmp, 1)) - return; - break; - } - iov++; - } -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND flag set if in append mode. 
- * cr - credentials of caller. - * ct - caller context (NFS/CIFS fem monitor only) - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = uio->uio_llimit; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - offset_t woff; - ssize_t n, nbytes; - rl_t *rl; - int max_blksz = zfsvfs->z_max_blksz; - uint64_t pflags = zp->z_phys->zp_flags; - int error; - - /* - * If immutable or not appending then return EPERM - */ - if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_phys->zp_size))) - return (EPERM); - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zilog = zfsvfs->z_log; - - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - */ - zfs_prefault_write(n, uio); - - /* - * If in append mode, set the io offset pointer to eof. - */ - if (ioflag & FAPPEND) { - /* - * Range lock for a file append: - * The value for the start of range will be determined by - * zfs_range_lock() (to guarantee append semantics). - * If this write will cause the block size to increase, - * zfs_range_lock() will lock the entire file, so we must - * later reduce the range after we grow the block size. 
- */ - rl = zfs_range_lock(zp, 0, n, RL_APPEND); - if (rl->r_len == UINT64_MAX) { - /* overlocked, zp_size can't change */ - woff = uio->uio_loffset = zp->z_phys->zp_size; - } else { - woff = uio->uio_loffset = rl->r_off; - } - } else { - woff = uio->uio_loffset; - /* - * Validate file offset - */ - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * If we need to grow the block size then zfs_range_lock() - * will lock a wider range than we request here. - * Later after growing the block size we reduce the range. - */ - rl = zfs_range_lock(zp, woff, n, RL_WRITER); - } - - if (woff >= limit) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (error); - } - end_size = MAX(zp->z_phys->zp_size, woff + n); - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - /* - * Start a transaction. - */ - woff = uio->uio_loffset; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - continue; - } - dmu_tx_abort(tx); - break; - } - - /* - * If zfs_range_lock() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. 
- */ - if (rl->r_len == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); - - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); - } - tx_bytes -= uio->uio_resid; - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - (zp->z_phys->zp_mode & S_ISUID) != 0 && - zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); - } - mutex_exit(&zp->z_acl_lock); - - /* - * Update time stamp. NOTE: This marks the bonus buffer as - * dirty, so we don't have to do it again for zp_size. 
- */ - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, - uio->uio_loffset); - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - } - - zfs_range_unlock(rl); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (FSYNC | FDSYNC)) - zil_commit(zilog, zp->z_last_itx, zp->z_id); - - ZFS_EXIT(zfsvfs); - return (0); -} - -void -zfs_get_done(dmu_buf_t *db, void *vzgd) -{ - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - vnode_t *vp = ZTOV(rl->r_zp); - - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - VN_RELE(vp); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - kmem_free(zgd, sizeof (zgd_t)); -} - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t off = lr->lr_offset; - dmu_buf_t *db; - rl_t *rl; - zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ - int error = 0; - - ASSERT(zio); - ASSERT(dlen != 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) - return (ENOENT); - if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); - return (ENOENT); - } - - /* - * Write records come in two flavors: immediate and indirect. 
- * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - rl = zfs_range_lock(zp, off, dlen, RL_READER); - /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { - error = ENOENT; - goto out; - } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); - } else { /* indirect write */ - uint64_t boff; /* block starting offset */ - - /* - * Have to lock the whole block to ensure when it's - * written out and it's checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - if (ISP2(zp->z_blksz)) { - boff = P2ALIGN_TYPED(off, zp->z_blksz, - uint64_t); - } else { - boff = 0; - } - dlen = zp->z_blksz; - rl = zfs_range_lock(zp, boff, dlen, RL_READER); - if (zp->z_blksz == dlen) - break; - zfs_range_unlock(rl); - } - /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { - error = ENOENT; - goto out; - } - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_rl = rl; - zgd->zgd_zilog = zfsvfs->z_log; - zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); - ASSERT(boff == db->db_offset); - lr->lr_blkoff = off - boff; - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT((error && error != EINPROGRESS) || - lr->lr_length <= zp->z_blksz); - if (error == 0) - zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zfs_get_done() callback. 
- */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - kmem_free(zgd, sizeof (zgd_t)); - } -out: - zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held vnode reference for it. - * - * IN: dvp - vnode of directory to search. - * nm - name of entry to lookup. - * pnp - full pathname to lookup [UNUSED]. - * flags - LOOKUP_XATTR set if looking for an attribute. - * rdir - root directory vnode [UNUSED]. - * cr - credentials of caller. - * ct - caller context - * direntflags - directory lookup flags - * realpnp - returned pathname. - * - * OUT: vpp - vnode of located entry, NULL if not found. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -static int -zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - znode_t *zdp = VTOZ(dvp); - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *vpp = NULL; - - if (flags & LOOKUP_XATTR) { - /* - * If the xattr property is off, refuse the lookup request. - */ - if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * We don't allow recursive attributes.. - * Maybe someday we will. 
- */ - if (zdp->z_phys->zp_flags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, - B_FALSE, cr)) { - VN_RELE(*vpp); - *vpp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - if (dvp->v_type != VDIR) { - ZFS_EXIT(zfsvfs); - return (ENOTDIR); - } - - /* - * Check accessibility of directory. - */ - - if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - - error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) { - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the vp of the created or trunc'd file. - * - * IN: dvp - vnode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - large file flag [UNUSED]. - * ct - caller context - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created or trunc'd entry. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated if new entry created - * vp - ctime|mtime always, atime if new - */ - -/* ARGSUSED */ -static int -zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, - int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, - vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - objset_t *os; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - - if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } -top: - *vpp = NULL; - - if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) - vap->va_mode &= ~VSVTX; - - if (*name == '\0') { - /* - * Null component name refers to the directory itself. 
- */ - VN_HOLD(dvp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible VN_HOLD(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (strcmp(name, "..") == 0) - error = EISDIR; - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); - return (error); - } - } - - if (zp == NULL) { - uint64_t txtype; - - /* - * Create a new file object and update the directory - * to reference it. - */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - if ((dzp->z_phys->zp_flags & ZFS_XATTR) && - (vap->va_type != VREG)) { - error = EINVAL; - goto out; - } - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || - IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return 
(error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - (void) zfs_link_create(dl, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; - - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl == EXCL) { - error = EEXIST; - goto out; - } - /* - * Can't open a directory for writing. - */ - if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { - error = EISDIR; - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } - - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); - - /* - * Truncate regular files if requested. - */ - if ((ZTOV(zp)->v_type == VREG) && - (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { - error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() */ - zfs_dirent_unlock(dl); - VN_RELE(ZTOV(zp)); - goto top; - } - - if (error == 0) { - vnevent_create(ZTOV(zp), ct); - } - } - } -out: - - if (dl) - zfs_dirent_unlock(dl); - - if (error) { - if (zp) - VN_RELE(ZTOV(zp)); - } else { - *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. - */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } - } - if (aclp) - zfs_acl_free(aclp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dvp - vnode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. 
- * ct - caller context - * flags - case flags - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime - * vp - ctime (if nlink > 0) - */ -/*ARGSUSED*/ -static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, - int flags) -{ - znode_t *zp, *dzp = VTOZ(dvp); - znode_t *xzp = NULL; - vnode_t *vp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked; - uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; - int error; - int zflg = ZEXISTS; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp)) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (vp->v_type == VDIR) { - error = EPERM; - goto out; - } - - vnevent_remove(vp, dvp, name, ct); - - if (realnmp) - dnlc_remove(dvp, realnmp->pn_buf); - else - dnlc_remove(dvp, name); - - mutex_enter(&vp->v_lock); - may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); - mutex_exit(&vp->v_lock); - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the vnode. So we dmu_tx_hold() the right things to - * allow for either case. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - - /* are there any extended attributes? 
*/ - if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { - /* XXX - do we need this if we are deleting? */ - dmu_tx_hold_bonus(tx, xattr_obj); - } - - /* are there any additional acls */ - if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && - may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - if (realnmp) - pn_free(realnmp); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - mutex_enter(&vp->v_lock); - delete_now = may_delete_now && - vp->v_count == 1 && !vn_has_cached_data(vp) && - zp->z_phys->zp_xattr == xattr_obj && - zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; - mutex_exit(&vp->v_lock); - } - - if (delete_now) { - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT3U(error, ==, 0); - ASSERT3U(xzp->z_phys->zp_links, ==, 2); - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - xzp->z_phys->zp_links = 0; - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - zp->z_phys->zp_xattr = 0; /* probably unnecessary */ - } - mutex_enter(&zp->z_lock); - mutex_enter(&vp->v_lock); - vp->v_count--; - ASSERT3U(vp->v_count, ==, 0); - mutex_exit(&vp->v_lock); - mutex_exit(&zp->z_lock); - zfs_znode_delete(zp, tx); - } else if (unlinked) { - zfs_unlinked_add(zp, tx); - } - - txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); - - dmu_tx_commit(tx); -out: - if (realnmp) - 
pn_free(realnmp); - - zfs_dirent_unlock(dl); - - if (!delete_now) { - VN_RELE(vp); - } else if (xzp) { - /* this rele delayed to prevent nesting transactions */ - VN_RELE(ZTOV(xzp)); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dvp using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dvp - vnode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * ct - caller context - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created directory. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - * vp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, - caller_context_t *ct, int flags, vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - zfs_dirlock_t *dl; - uint64_t txtype; - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; - int zf = ZNEW; - - ASSERT(vap->va_type == VDIR); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| - IS_EPHEMERAL(crgetgid(cr)))) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_phys->zp_flags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - if (vap->va_mask & AT_XVATTR) - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * 
First make sure the new directory doesn't exist. - */ -top: - *vpp = NULL; - - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - } - /* - * Add a new entry to the directory. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || - IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); - - /* - * Now put new name in parent dir. 
- */ - (void) zfs_link_create(dl, zp, tx, ZNEW); - - *vpp = ZTOV(zp); - - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); - - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dvp - vnode of directory to remove from. - * name - name of directory to be removed. - * cwd - vnode of current working directory. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp; - vnode_t *vp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zflg = ZEXISTS; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - if (vp->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - - if (vp == cwd) { - error = EINVAL; - goto out; - } - - vnevent_rmdir(vp, dvp, name, ct); - - /* - * Grab a lock on the directory to make sure that noone is - * trying to add (or lookup) entries while we are removing it. 
- */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. - */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); - } - - dmu_tx_commit(tx); - - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -out: - zfs_dirent_unlock(dl); - - VN_RELE(vp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read as many directory entries as will fit into the provided - * buffer from the given directory cursor position (specified in - * the uio structure. - * - * IN: vp - vnode of directory to read. - * uio - structure supplying read location, range info, - * and return buffer. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * OUT: uio - updated offset and range, buffer filled. - * eofp - set to true if end-of-file detected. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. 
- */ -/* ARGSUSED */ -static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, - caller_context_t *ct, int flags) -{ - znode_t *zp = VTOZ(vp); - iovec_t *iovp; - edirent_t *eodp; - dirent64_t *odp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; - caddr_t outbuf; - size_t bufsize; - zap_cursor_t zc; - zap_attribute_t zap; - uint_t bytes_wanted; - uint64_t offset; /* must be unsigned; checks for < 1 */ - int local_eof; - int outcount; - int error; - uint8_t prefetch; - boolean_t check_sysattrs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * If we are not given an eof variable, - * use a local one. - */ - if (eofp == NULL) - eofp = &local_eof; - - /* - * Check for valid iov_len. - */ - if (uio->uio_iov->iov_len <= 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Quit if directory has been removed (posix) - */ - if ((*eofp = zp->z_unlinked) != 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - error = 0; - os = zfsvfs->z_os; - offset = uio->uio_loffset; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Get space to change directory entries into fs independent format. - */ - iovp = uio->uio_iov; - bytes_wanted = iovp->iov_len; - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { - bufsize = bytes_wanted; - outbuf = kmem_alloc(bufsize, KM_SLEEP); - odp = (struct dirent64 *)outbuf; - } else { - bufsize = bytes_wanted; - odp = (struct dirent64 *)iovp->iov_base; - } - eodp = (struct edirent *)odp; - - /* - * If this VFS supports system attributes; and we're looking at an - * extended attribute directory; and we care about normalization - * conflicts on this vfs; then we must check for normalization - * conflicts with the sysattr name space. 
- */ - check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) && - (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && - (flags & V_RDDIR_ENTFLAGS); - - /* - * Transform to file-system independent format - */ - outcount = 0; - while (outcount < bytes_wanted) { - ino64_t objnum; - ushort_t reclen; - off64_t *next; - - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = zp->z_phys->zp_parent; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - } else { - /* - * Grab next entry. - */ - if (error = zap_cursor_retrieve(&zc, &zap)) { - if ((*eofp = (error == ENOENT)) != 0) - break; - else - goto update; - } - - if (zap.za_integer_length != 8 || - zap.za_num_integers != 1) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset); - error = ENXIO; - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - /* - * MacOS X can extract the object type here such as: - * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); - */ - - if (check_sysattrs && !zap.za_normalization_conflict) { - zap.za_normalization_conflict = - xattr_sysattr_casechk(zap.za_name); - } - } - - if (flags & V_RDDIR_ENTFLAGS) - reclen = EDIRENT_RECLEN(strlen(zap.za_name)); - else - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); - - /* - * Will this entry fit in the buffer? - */ - if (outcount + reclen > bufsize) { - /* - * Did we manage to fit anything in the buffer? 
- */ - if (!outcount) { - error = EINVAL; - goto update; - } - break; - } - if (flags & V_RDDIR_ENTFLAGS) { - /* - * Add extended flag entry: - */ - eodp->ed_ino = objnum; - eodp->ed_reclen = reclen; - /* NOTE: ed_off is the offset for the *next* entry */ - next = &(eodp->ed_off); - eodp->ed_eflags = zap.za_normalization_conflict ? - ED_CASE_CONFLICT : 0; - (void) strncpy(eodp->ed_name, zap.za_name, - EDIRENT_NAMELEN(reclen)); - eodp = (edirent_t *)((intptr_t)eodp + reclen); - } else { - /* - * Add normal entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - /* NOTE: d_off is the offset for the *next* entry */ - next = &(odp->d_off); - (void) strncpy(odp->d_name, zap.za_name, - DIRENT64_NAMELEN(reclen)); - odp = (dirent64_t *)((intptr_t)odp + reclen); - } - outcount += reclen; - - ASSERT(outcount <= bufsize); - - /* Prefetch znode */ - if (prefetch) - dmu_prefetch(os, objnum, 0, 0); - - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - *next = offset; - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - - if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { - iovp->iov_base += outcount; - iovp->iov_len -= outcount; - uio->uio_resid -= outcount; - } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { - /* - * Reset the pointer. 
- */ - offset = uio->uio_loffset; - } - -update: - zap_cursor_fini(&zc); - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) - kmem_free(outbuf, bufsize); - - if (error == ENOENT) - error = 0; - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - uio->uio_loffset = offset; - ZFS_EXIT(zfsvfs); - return (error); -} - -ulong_t zfs_fsync_sync_cnt = 4; - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - /* - * Regardless of whether this is required for standards conformance, - * this is the logical behavior when fsync() is called on a file with - * dirty pages. We use B_ASYNC since the ZIL transactions are already - * going to be pushed out as part of the zil_commit(). - */ - if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && - (vp->v_type == VREG) && !(IS_SWAPVP(vp))) - (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - ZFS_EXIT(zfsvfs); - return (0); -} - - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: vp - vnode of file. - * vap - va_mask identifies requested attributes. - * If AT_XVATTR set, then optional attrs are requested - * flags - ATTR_NOACLCHECK (CIFS server context) - * cr - credentials of caller. - * ct - caller context - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp; - int error = 0; - uint64_t links; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; - - mutex_enter(&zp->z_lock); - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. - */ - if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && - (pzp->zp_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr)) { - mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - - vap->va_type = vp->v_type; - vap->va_mode = pzp->zp_mode & MODEMASK; - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); - vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; - vap->va_nodeid = zp->z_id; - if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) - links = pzp->zp_links + 1; - else - links = pzp->zp_links; - vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ - vap->va_size = pzp->zp_size; - vap->va_rdev = vp->v_rdev; - vap->va_seq = zp->z_seq; - - /* - * Add in any requested optional attributes and the create time. - * Also set the corresponding bits in the returned attribute bitmap. 
- */ - if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - xoap->xoa_archive = - ((pzp->zp_flags & ZFS_ARCHIVE) != 0); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - xoap->xoa_readonly = - ((pzp->zp_flags & ZFS_READONLY) != 0); - XVA_SET_RTN(xvap, XAT_READONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - xoap->xoa_system = - ((pzp->zp_flags & ZFS_SYSTEM) != 0); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - xoap->xoa_hidden = - ((pzp->zp_flags & ZFS_HIDDEN) != 0); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - xoap->xoa_nounlink = - ((pzp->zp_flags & ZFS_NOUNLINK) != 0); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - xoap->xoa_immutable = - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - xoap->xoa_appendonly = - ((pzp->zp_flags & ZFS_APPENDONLY) != 0); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - xoap->xoa_nodump = - ((pzp->zp_flags & ZFS_NODUMP) != 0); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - xoap->xoa_opaque = - ((pzp->zp_flags & ZFS_OPAQUE) != 0); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - xoap->xoa_av_quarantined = - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - xoap->xoa_av_modified = - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG && - (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - - /* - * Only VREG files have anti-virus scanstamps, so we - * won't conflict with symlinks in the bonus buffer. 
- */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len <= doi.doi_bonus_size) { - /* - * pzp points to the start of the - * znode_phys_t. pzp + 1 points to the - * first byte after the znode_phys_t. - */ - (void) memcpy(xoap->xoa_av_scanstamp, - pzp + 1, - sizeof (xoap->xoa_av_scanstamp)); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - } - - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - - mutex_exit(&zp->z_lock); - - dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); - - if (zp->z_blksz == 0) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - vap->va_blksize = zfsvfs->z_max_blksz; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: vp - vnode of file to be modified. - * vap - new attribute values. - * If AT_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * ct - caller context - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime updated, mtime updated if size changed. 
- */ -/* ARGSUSED */ -static int -zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - znode_phys_t *pzp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - uint_t mask = vap->va_mask; - uint_t saved_mask; - int trim_mask = 0; - uint64_t new_mode; - znode_t *attrzp; - int need_policy = FALSE; - int err; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - if (mask == 0) - return (0); - - if (mask & AT_NOSET) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - pzp = zp->z_phys; - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & AT_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - if (mask & AT_SIZE && vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (EISDIR); - } - - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * If this is an xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. - */ - xoap = xva_getxoptattr(xvap); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((pzp->zp_flags & ZFS_IMMUTABLE) && - ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || - ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - /* - * Verify timestamps doesn't overflow 32 bits. 
- * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. - */ - if (mask & (AT_ATIME | AT_MTIME)) { - if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { - ZFS_EXIT(zfsvfs); - return (EOVERFLOW); - } - } - -top: - attrzp = NULL; - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (EROFS); - } - - /* - * First validate permissions - */ - - if (mask & AT_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - do { - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - if (mask & (AT_ATIME|AT_MTIME) || - ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - - if (mask & (AT_UID|AT_GID)) { - int idmask = (mask & (AT_UID|AT_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. 
- */ - - if (!(mask & AT_MODE)) - vap->va_mode = pzp->zp_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both AT_UID and AT_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || - ((idmask == AT_UID) && take_owner) || - ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - secpolicy_setid_clear(vap, cr); - trim_mask = (mask & (AT_UID|AT_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - mutex_enter(&zp->z_lock); - oldva.va_mode = pzp->zp_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & AT_XVATTR) { - if ((need_policy == FALSE) && - (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && - xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && - xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && - xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NODUMP) && - xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && - xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || - ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && - ((vp->v_type != VREG && xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - mutex_exit(&zp->z_lock); - - if (mask & AT_MODE) 
{ - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - trim_mask |= AT_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. - */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - } - err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - - if (trim_mask) - vap->va_mask |= saved_mask; - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - - if (mask & AT_MODE) { - uint64_t pmode = pzp->zp_mode; - - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } - if (pzp->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL from old V0 format to new V1 */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - pzp->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - DMU_OBJECT_END); - 
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - aclp->z_acl_bytes); - } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - } - - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); - } - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; - } - - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } - - dmu_buf_will_dirty(zp->z_dbuf, tx); - - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. 
- */ - - mutex_enter(&zp->z_lock); - - if (mask & AT_MODE) { - mutex_enter(&zp->z_acl_lock); - zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); - ASSERT3U(err, ==, 0); - mutex_exit(&zp->z_acl_lock); - } - - if (attrzp) - mutex_enter(&attrzp->z_lock); - - if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } - } - - if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); - if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); - } - - if (aclp) - zfs_acl_free(aclp); - - if (attrzp) - mutex_exit(&attrzp->z_lock); - - if (mask & AT_ATIME) - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - - if (mask & AT_MTIME) - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - - if (mask & AT_SIZE) - zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); - else if (mask != 0) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & AT_XVATTR)) { - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - - ASSERT(vp->v_type == VREG); - - /* Grow the bonus buffer if necessary. 
*/ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); - } - zfs_xvattr_set(zp, xvap); - } - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - if (fuidp) - zfs_fuid_info_free(fuidp); - mutex_exit(&zp->z_lock); - - if (attrzp) - VN_RELE(ZTOV(attrzp)); - - dmu_tx_commit(tx); - - ZFS_EXIT(zfsvfs); - return (err); -} - -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - -/* - * Drop locks and release vnodes that were held by zfs_rename_lock(). - */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - VN_RELE(ZTOV(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); - } -} - -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t *oidp = &zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; - - /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. - */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. 
- */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = &zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); - } - } - - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; - - if (*oidp == szp->z_id) /* We're a descendant of szp */ - return (EINVAL); - - if (*oidp == rootid) /* We've hit the top */ - return (0); - - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - oidp = &zp->z_phys->zp_parent; - rwlp = &zp->z_parent_lock; - rw = RW_READER; - - } while (zp->z_id != sdzp->z_id); - - return (0); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdvp - Source directory containing the "old entry". - * snm - Old entry name. - * tdvp - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * sdvp,tdvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *tdzp, *szp, *tzp; - znode_t *sdzp = VTOZ(sdvp); - zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; - zilog_t *zilog; - vnode_t *realvp; - zfs_dirlock_t *sdl, *tdl; - dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; - int error = 0; - int zflg = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - zilog = zfsvfs->z_log; - - /* - * Make sure we have the real vp for the target directory. 
- */ - if (VOP_REALVP(tdvp, &realvp, ct) == 0) - tdvp = realvp; - - if (tdvp->v_vfsp != sdvp->v_vfsp) { - ZFS_EXIT(zfsvfs); - return (EXDEV); - } - - tdzp = VTOZ(tdvp); - ZFS_VERIFY_ZP(tdzp); - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != - (sdzp->z_phys->zp_flags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. - */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. 
FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } - } - - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); - } - - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - VN_RELE(ZTOV(tzp)); - } - if (strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - VN_RELE(ZTOV(szp)); - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto out; - - if (ZTOV(szp)->v_type == VDIR) { - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) - goto out; - } - - /* - * Does target exist? 
- */ - if (tzp) { - /* - * Source and target must be the same type. - */ - if (ZTOV(szp)->v_type == VDIR) { - if (ZTOV(tzp)->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - } else { - if (ZTOV(tzp)->v_type == VDIR) { - error = EISDIR; - goto out; - } - } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } - } - - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. - */ - if (tdvp != sdvp) { - vnevent_rename_dest_dir(tdvp, ct); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ - dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) - dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - if (tzp) - dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); - - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; - - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - ASSERT(error == 0); - - zfs_log_rename(zilog, tx, - TX_RENAME | (flags & FIGNORECASE ? 
TX_CI : 0), - sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); - } - } - - dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dvp - Directory to contain new symbolic link. - * link - Name for new symlink entry. - * vap - Attributes of new entry. - * target - Target path of new symlink. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfs_dirlock_t *dl; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - int len = strlen(link); - int error; - int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; - - ASSERT(vap->va_type == VLNK); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (ENAMETOOLONG); - } - - /* - * Attempt to lock directory; fail if entry already exists. 
- */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - - /* - * Create a new object for the symlink. - * Put the link content into bonus buffer if it will fit; - * otherwise, store it just like any other file data. - */ - if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); - if (len != 0) - bcopy(link, zp->z_phys + 1, len); - } else { - dmu_buf_t *dbp; - - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); - /* - * Nothing can access the znode yet so no locking needed - * for growing the znode's blocksize. - */ - zfs_grow_blocksize(zp, len, tx); - - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp)); - dmu_buf_will_dirty(dbp, tx); - - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } - zp->z_phys->zp_size = len; - - /* - * Insert the new object into the directory. 
- */ - (void) zfs_link_create(dl, zp, tx, ZNEW); -out: - if (error == 0) { - uint64_t txtype = TX_SYMLINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - } - if (fuidp) - zfs_fuid_info_free(fuidp); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - VN_RELE(ZTOV(zp)); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by vp. - * - * IN: vp - vnode of symbolic link. - * uoip - structure to contain the link path. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - structure to contain the link path. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - size_t bufsz; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - bufsz = (size_t)zp->z_phys->zp_size; - if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { - error = uiomove(zp->z_phys + 1, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdvp referencing svp. - * - * IN: tdvp - Directory to contain new entry. - * svp - vnode of new entry. - * name - name of new entry. - * cr - credentials of caller. 
- * ct - caller context - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * tdvp - ctime|mtime updated - * svp - ctime updated - */ -/* ARGSUSED */ -static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *dzp = VTOZ(tdvp); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - vnode_t *realvp; - int error; - int zf = ZNEW; - uid_t owner; - - ASSERT(tdvp->v_type == VDIR); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (VOP_REALVP(svp, &realvp, ct) == 0) - svp = realvp; - - if (svp->v_vfsp != tdvp->v_vfsp) { - ZFS_EXIT(zfsvfs); - return (EXDEV); - } - szp = VTOZ(svp); - ZFS_VERIFY_ZP(szp); - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - -top: - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_phys->zp_flags & ZFS_XATTR) != - (dzp->z_phys->zp_flags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (svp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. 
- */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (error == 0) { - vnevent_link(svp, ct); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * zfs_null_putapage() is used when the file system has been force - * unmounted. It just drops the pages. - */ -/* ARGSUSED */ -static int -zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); - return (0); -} - -/* - * Push a page out to disk, klustering if possible. - * - * IN: vp - file to push page to. - * pp - page to push. - * flags - additional flags. - * cr - credentials of caller. - * - * OUT: offp - start of range pushed. - * lenp - len of range pushed. - * - * RETURN: 0 if success - * error code if failure - * - * NOTE: callers must have locked the page to be pushed. On - * exit, the page (and all other pages in the kluster) must be - * unlocked. 
- */ -/* ARGSUSED */ -static int -zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - dmu_tx_t *tx; - rl_t *rl; - u_offset_t off, koff; - size_t len, klen; - uint64_t filesz; - int err; - - filesz = zp->z_phys->zp_size; - off = pp->p_offset; - len = PAGESIZE; - /* - * If our blocksize is bigger than the page size, try to kluster - * muiltiple pages so that we write a full block (thus avoiding - * a read-modify-write). - */ - if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - klen = zp->z_blksz; - koff = P2ALIGN(off, (u_offset_t)klen); - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); - pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); - } - ASSERT3U(btop(len), ==, btopr(len)); -top: - rl = zfs_range_lock(zp, off, len, RL_WRITER); - /* - * Can't push pages past end-of-file. 
- */ - filesz = zp->z_phys->zp_size; - if (off >= filesz) { - /* ignore all pages */ - err = 0; - goto out; - } else if (off + len > filesz) { - int npages = btopr(filesz - off); - page_t *trunc; - - page_list_break(&pp, &trunc, npages); - /* ignore pages past end of file */ - if (trunc) - pvn_write_done(trunc, flags); - len = filesz - off; - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, off, len); - dmu_tx_hold_bonus(tx, zp->z_id); - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err != 0) { - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - zfs_range_unlock(rl); - dmu_tx_wait(tx); - dmu_tx_abort(tx); - err = 0; - goto top; - } - dmu_tx_abort(tx); - goto out; - } - - if (zp->z_blksz <= PAGESIZE) { - caddr_t va = ppmapin(pp, PROT_READ, (caddr_t)-1); - ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); - ppmapout(va); - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); - } - - if (err == 0) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); - dmu_tx_commit(tx); - } - -out: - zfs_range_unlock(rl); - pvn_write_done(pp, (err ? B_ERROR : 0) | flags); - if (offp) - *offp = off; - if (lenp) - *lenp = len; - - return (err); -} - -/* - * Copy the portion of the file indicated from pages into the file. - * The pages are stored in a page list attached to the files vnode. - * - * IN: vp - vnode of file to push page data to. - * off - position in file to put data. - * len - amount of data to write. - * flags - flags to control the operation. - * cr - credentials of caller. - * ct - caller context. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp; - size_t io_len; - u_offset_t io_off; - uint64_t filesz; - int error = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (len == 0) { - /* - * Search the entire vp list for pages >= off. - */ - error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, - flags, cr); - goto out; - } - - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off > filesz) { - /* past end of file */ - ZFS_EXIT(zfsvfs); - return (0); - } - - len = MIN(len, filesz - off); - - for (io_off = off; io_off < off + len; io_off += io_len) { - if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { - pp = page_lookup(vp, io_off, - (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); - } else { - pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); - } - - if (pp != NULL && pvn_getdirty(pp, flags)) { - int err; - - /* - * Found a dirty page to push - */ - err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); - if (err) - error = err; - } else { - io_len = PAGESIZE; - } - } -out: - if ((flags & B_ASYNC) == 0) - zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_dbuf == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. 
- */ - if (vn_has_cached_data(vp)) { - (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, - B_INVAL, cr); - } - - mutex_enter(&zp->z_lock); - vp->v_count = 0; /* count arrives as 1 */ - mutex_exit(&zp->z_lock); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - zfs_znode_free(zp); - return; - } - - /* - * Attempt to push any data in the page cache. If this fails - * we will get kicked out later in zfs_zinactive(). - */ - if (vn_has_cached_data(vp)) { - (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, - cr); - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} - -/* - * Bounds-check the seek operation. - * - * IN: vp - vnode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * ct - caller context - * - * RETURN: 0 if success - * EINVAL if new offset invalid - */ -/* ARGSUSED */ -static int -zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, - caller_context_t *ct) -{ - if (vp->v_type == VDIR) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -/* - * Pre-filter the generic locking function to trap attempts to place - * a mandatory lock on a memory mapped file. 
- */ -static int -zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, - flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * We are following the UFS semantics with respect to mapcnt - * here: If we see that the file is mapped already, then we will - * return an error, but we don't worry about races between this - * function and zfs_map(). - */ - if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } - error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * If we can't find a page in the cache, we will create a new page - * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). - */ -static int -zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, - caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) -{ - znode_t *zp = VTOZ(vp); - page_t *pp, *cur_pp; - objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; - u_offset_t io_off, total; - uint64_t oid = zp->z_id; - size_t io_len; - uint64_t filesz; - int err; - - /* - * If we are only asking for a single page don't bother klustering. - */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); - if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { - io_off = off; - io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); - } else { - /* - * Try to fill a kluster of pages (a blocks worth). - */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. 
For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); - pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); - } - if (pp == NULL) { - /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. - */ - *pl = NULL; - return (0); - } - - /* - * Fill the pages in the kluster. - */ - cur_pp = pp; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - ASSERT3U(io_off, ==, cur_pp->p_offset); - va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1); - err = dmu_read(os, oid, io_off, PAGESIZE, va); - ppmapout(va); - if (err) { - /* On error, toss the entire kluster */ - pvn_read_done(pp, B_ERROR); - return (err); - } - cur_pp = cur_pp->p_next; - } -out: - /* - * Fill in the page list array from the kluster. If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. - * NOTE: the page list will always be null terminated. - */ - pvn_plist_init(pp, pl, plsz, off, io_len, rw); - - return (0); -} - -/* - * Return pointers to the pages for the file region [off, off + len] - * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. - * - * IN: vp - vnode of file to get data from. - * off - position in file to get data from. - * len - amount of data to retrieve. - * plsz - length of provided page list. - * seg - segment to obtain pages for. - * addr - virtual address of fault. - * rw - mode of created pages. 
- * cr - credentials of caller. - * ct - caller context. - * - * OUT: protp - protection mode of created pages. - * pl - list of pages created. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (protp) - *protp = PROT_ALL; - - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - - /* - * Loop through the requested range [off, off + len] looking - * for pages. If we don't find a page, we will need to create - * a new page and fill it with data from the file. - */ - while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; - off += PAGESIZE; - addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. 
- */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; - } - } - } - - /* - * Fill out the page array with any pages already in the cache. - */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; - } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); -out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; - if (err) { - /* - * Release any pages we have previously locked. - */ - while (pl > pl0) - page_unlock(*--pl); - } - - *pl = NULL; - - if (need_unlock) - rw_exit(&zp->z_map_lock); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Request a memory map for a section of a file. 
This code interacts - * with common code and the VM system as follows: - * - * common code calls mmap(), which ends up in smmap_common() - * - * this calls VOP_MAP(), which takes you into (say) zfs - * - * zfs_map() calls as_map(), passing segvn_create() as the callback - * - * segvn_create() creates the new segment and calls VOP_ADDMAP() - * - * zfs_addmap() updates z_mapcnt - */ -/*ARGSUSED*/ -static int -zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - segvn_crargs_t vn_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((prot & PROT_WRITE) && - (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | - ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if ((prot & (PROT_READ | PROT_EXEC)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (EACCES); - } - - if (vp->v_flag & VNOMAP) { - ZFS_EXIT(zfsvfs); - return (ENOSYS); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (ENXIO); - } - - if (vp->v_type != VREG) { - ZFS_EXIT(zfsvfs); - return (ENODEV); - } - - /* - * If file is locked, disallow mapping. 
- */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } - - as_rangelock(as); - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); - } - - vn_a.vp = vp; - vn_a.offset = (u_offset_t)off; - vn_a.type = flags & MAP_TYPE; - vn_a.prot = prot; - vn_a.maxprot = maxprot; - vn_a.cred = cr; - vn_a.amp = NULL; - vn_a.flags = flags & ~MAP_TYPE; - vn_a.szc = 0; - vn_a.lgrp_mem_policy_flags = 0; - - error = as_map(as, *addrp, len, segvn_create, &vn_a); - - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -static int -zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); - return (0); -} - -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. 
- */ -/* ARGSUSED */ -static int -zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); - atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - - return (0); -} - -/* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. - * - * IN: vp - vnode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. - * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller [UNUSED]. - * ct - caller context. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime|mtime updated - */ -/* ARGSUSED */ -static int -zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t off, len; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - -top: - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - if (error = convoff(vp, bfp, 0, offset)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - - off = bfp->l_start; - len = bfp->l_len; /* 0 means from off to end of file */ - - do { - error = zfs_freesp(zp, off, len, flag, TRUE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t 
*ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - gen = (uint32_t)zp->z_gen; - - size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; - if (fidp->fid_len < size) { - fidp->fid_len = size; - ZFS_EXIT(zfsvfs); - return (ENOSPC); - } - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - if (size == LONG_FID_LEN) { - uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); - zfid_long_t *zlfid; - - zlfid = (zfid_long_t *)fidp; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); - - /* XXX - this should be the generation number for the objset */ - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - zlfid->zf_setgen[i] = 0; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp, *xzp; - zfsvfs_t *zfsvfs; - zfs_dirlock_t *dl; - int error; - - switch (cmd) { - case _PC_LINK_MAX: - *valp = ULONG_MAX; - return (0); - - case _PC_FILESIZEBITS: - *valp = 64; - return (0); - - case _PC_XATTR_EXISTS: - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - *valp = 0; - error = zfs_dirent_lock(&dl, zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); - if (error == 0) { - zfs_dirent_unlock(dl); - if (!zfs_dirempty(xzp)) - *valp = 1; - VN_RELE(ZTOV(xzp)); - } else if (error == ENOENT) { - /* - * If there aren't extended attributes, it's the - * same as having zero of them. 
- */ - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); - - case _PC_SATTR_ENABLED: - case _PC_SATTR_EXISTS: - *valp = vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) && - (vp->v_type == VREG || vp->v_type == VDIR); - return (0); - - case _PC_ACL_ENABLED: - *valp = _ACL_ACE_ENABLED; - return (0); - - case _PC_MIN_HOLE_SIZE: - *valp = (ulong_t)SPA_MINBLOCKSIZE; - return (0); - - default: - return (fs_pathconf(vp, cmd, valp, cr, ct)); - } -} - -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -static int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Predeclare these here so that the compiler assumes that - * this is an "old style" function declaration that does - * not include arguments => we won't get type mismatch errors - * in the initializations that follow. 
- */ -static int zfs_inval(); -static int zfs_isdir(); - -static int -zfs_inval() -{ - return (EINVAL); -} - -static int -zfs_isdir() -{ - return (EISDIR); -} -/* - * Directory vnode operations template - */ -vnodeops_t *zfs_dvnodeops; -const fs_operation_def_t zfs_dvnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_READ, { .error = zfs_isdir }, - VOPNAME_WRITE, { .error = zfs_isdir }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_CREATE, { .vop_create = zfs_create }, - VOPNAME_REMOVE, { .vop_remove = zfs_remove }, - VOPNAME_LINK, { .vop_link = zfs_link }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, - VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, - VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, - VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Regular file vnode operations template - */ -vnodeops_t *zfs_fvnodeops; -const fs_operation_def_t zfs_fvnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_READ, { .vop_read = zfs_read }, - VOPNAME_WRITE, { .vop_write = zfs_write }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access 
= zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, - VOPNAME_SPACE, { .vop_space = zfs_space }, - VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, - VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, - VOPNAME_MAP, { .vop_map = zfs_map }, - VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, - VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Symbolic link vnode operations template - */ -vnodeops_t *zfs_symvnodeops; -const fs_operation_def_t zfs_symvnodeops_template[] = { - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Extended attribute directory vnode operations template - * This template is identical to the directory vnodes - * operation template except for restricted operations: - * VOP_MKDIR() - * VOP_SYMLINK() - * Note that there are other restrictions embedded in: - * zfs_create() - restrict type to VREG - * zfs_link() - no links into/out of attribute space - * zfs_rename() - no moves into/out of attribute space - */ -vnodeops_t *zfs_xdvnodeops; -const fs_operation_def_t zfs_xdvnodeops_template[] = { - 
VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_CREATE, { .vop_create = zfs_create }, - VOPNAME_REMOVE, { .vop_remove = zfs_remove }, - VOPNAME_LINK, { .vop_link = zfs_link }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_MKDIR, { .error = zfs_inval }, - VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, - VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, - VOPNAME_SYMLINK, { .error = zfs_inval }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Error vnode operations template - */ -vnodeops_t *zfs_evnodeops; -const fs_operation_def_t zfs_evnodeops_template[] = { - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - NULL, NULL -}; diff --git a/zfs/lib/libdmu-ctl/zvol.c b/zfs/lib/libdmu-ctl/zvol.c deleted file mode 100644 index 5d16a4d1f..000000000 --- a/zfs/lib/libdmu-ctl/zvol.c +++ /dev/null @@ -1,1830 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "@(#)zvol.c 1.31 08/04/09 SMI" - -/* - * ZFS volume emulation driver. - * - * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. - * Volumes are accessed through the symbolic links named: - * - * /dev/zvol/dsk/<pool_name>/<dataset_name> - * /dev/zvol/rdsk/<pool_name>/<dataset_name> - * - * These links are created by the ZFS-specific devfsadm link generator. - * Volumes are persistent through reboot. No user command needs to be - * run before opening and using a device. 
- */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/buf.h> -#include <sys/modctl.h> -#include <sys/open.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> -#include <sys/zap.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu_traverse.h> -#include <sys/dnode.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/dkio.h> -#include <sys/efi_partition.h> -#include <sys/byteorder.h> -#include <sys/pathname.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/crc32.h> -#include <sys/dirent.h> -#include <sys/policy.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_ioctl.h> -#include <sys/mkdev.h> -#include <sys/zil.h> -#include <sys/refcount.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_rlock.h> -#include <sys/vdev_disk.h> -#include <sys/vdev_impl.h> -#include <sys/zvol.h> -#include <sys/dumphdr.h> - -#include "zfs_namecheck.h" - -static void *zvol_state; - -#define ZVOL_DUMPSIZE "dumpsize" - -/* - * This lock protects the zvol_state structure from being modified - * while it's being used, e.g. an open that comes in before a create - * finishes. It also protects temporary opens of the dataset so that, - * e.g., an open doesn't get a spurious EBUSY. - */ -static kmutex_t zvol_state_lock; -static uint32_t zvol_minors; - -#define NUM_EXTENTS ((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t)) - -typedef struct zvol_extent { - dva_t ze_dva; /* dva associated with this extent */ - uint64_t ze_stride; /* extent stride */ - uint64_t ze_size; /* number of blocks in extent */ -} zvol_extent_t; - -/* - * The list of extents associated with the dump device - */ -typedef struct zvol_ext_list { - zvol_extent_t zl_extents[NUM_EXTENTS]; - struct zvol_ext_list *zl_next; -} zvol_ext_list_t; - -/* - * The in-core state of each volume. 
- */ -typedef struct zvol_state { - char zv_name[MAXPATHLEN]; /* pool/dd name */ - uint64_t zv_volsize; /* amount of space we advertise */ - uint64_t zv_volblocksize; /* volume block size */ - minor_t zv_minor; /* minor number */ - uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly; dumpified */ - objset_t *zv_objset; /* objset handle */ - uint32_t zv_mode; /* DS_MODE_* flags at open time */ - uint32_t zv_open_count[OTYPCNT]; /* open counts */ - uint32_t zv_total_opens; /* total open count */ - zilog_t *zv_zilog; /* ZIL handle */ - zvol_ext_list_t *zv_list; /* List of extents for dump */ - uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ - znode_t zv_znode; /* for range locking */ -} zvol_state_t; - -/* - * zvol specific flags - */ -#define ZVOL_RDONLY 0x1 -#define ZVOL_DUMPIFIED 0x2 - -/* - * zvol maximum transfer in one DMU tx. - */ -int zvol_maxphys = DMU_MAX_ACCESS/2; - -extern int zfs_set_prop_nvlist(const char *, nvlist_t *); -static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); -static int zvol_dumpify(zvol_state_t *zv); -static int zvol_dump_fini(zvol_state_t *zv); -static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); - -static void -zvol_size_changed(zvol_state_t *zv, major_t maj) -{ - dev_t dev = makedevice(maj, zv->zv_minor); - - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", zv->zv_volsize) == DDI_SUCCESS); - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); - - /* Notify specfs to invalidate the cached size */ - spec_size_invalidate(dev, VBLK); - spec_size_invalidate(dev, VCHR); -} - -int -zvol_check_volsize(uint64_t volsize, uint64_t blocksize) -{ - if (volsize == 0) - return (EINVAL); - - if (volsize % blocksize != 0) - return (EINVAL); - -#ifdef _ILP32 - if (volsize - 1 > SPEC_MAXOFFSET_T) - return (EOVERFLOW); -#endif - return (0); -} - -int -zvol_check_volblocksize(uint64_t volblocksize) -{ - if (volblocksize < 
SPA_MINBLOCKSIZE || - volblocksize > SPA_MAXBLOCKSIZE || - !ISP2(volblocksize)) - return (EDOM); - - return (0); -} - -static void -zvol_readonly_changed_cb(void *arg, uint64_t newval) -{ - zvol_state_t *zv = arg; - - if (newval) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; -} - -int -zvol_get_stats(objset_t *os, nvlist_t *nv) -{ - int error; - dmu_object_info_t doi; - uint64_t val; - - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); - if (error) - return (error); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); - - if (error == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); - } - - return (error); -} - -/* - * Find a free minor number. - */ -static minor_t -zvol_minor_alloc(void) -{ - minor_t minor; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) - if (ddi_get_soft_state(zvol_state, minor) == NULL) - return (minor); - - return (0); -} - -static zvol_state_t * -zvol_minor_lookup(const char *name) -{ - minor_t minor; - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - continue; - if (strcmp(zv->zv_name, name) == 0) - break; - } - - return (zv); -} - -void -zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp) -{ - ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ - ze->ze_stride = 0; - ze->ze_size = 1; -} - -/* extent mapping arg */ -struct maparg { - zvol_ext_list_t *ma_list; - zvol_extent_t *ma_extent; - int ma_gang; -}; - -/*ARGSUSED*/ -static int -zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - void *data = bc->bc_data; - dnode_phys_t *dnp = bc->bc_dnode; - struct maparg *ma = (struct maparg *)arg; - uint64_t stride; - - /* If there is an error, then keep 
trying to make progress */ - if (bc->bc_errno) - return (ERESTART); - -#ifdef ZFS_DEBUG - if (zb->zb_level == -1) { - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT3U(BP_GET_LEVEL(bp), ==, 0); - } else { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } - - if (zb->zb_level > 0) { - uint64_t fill = 0; - blkptr_t *bpx, *bpend; - - for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx); - bpx < bpend; bpx++) { - if (bpx->blk_birth != 0) { - fill += bpx->blk_fill; - } else { - ASSERT(bpx->blk_fill == 0); - } - } - ASSERT3U(fill, ==, bp->blk_fill); - } - - if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) { - uint64_t fill = 0; - dnode_phys_t *dnx, *dnend; - - for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT); - dnx < dnend; dnx++) { - if (dnx->dn_type != DMU_OT_NONE) - fill++; - } - ASSERT3U(fill, ==, bp->blk_fill); - } -#endif - - if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE) - return (0); - - /* Abort immediately if we have encountered gang blocks */ - if (BP_IS_GANG(bp)) { - ma->ma_gang++; - return (EINTR); - } - - /* first time? 
*/ - if (ma->ma_extent->ze_size == 0) { - zvol_init_extent(ma->ma_extent, bp); - return (0); - } - - stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) - - ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) + - (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride)); - if (DVA_GET_VDEV(BP_IDENTITY(bp)) == - DVA_GET_VDEV(&ma->ma_extent->ze_dva)) { - if (ma->ma_extent->ze_stride == 0) { - /* second block in this extent */ - ma->ma_extent->ze_stride = stride; - ma->ma_extent->ze_size++; - return (0); - } else if (ma->ma_extent->ze_stride == stride) { - /* - * the block we allocated has the same - * stride - */ - ma->ma_extent->ze_size++; - return (0); - } - } - - /* - * dtrace -n 'zfs-dprintf - * /stringof(arg0) == "zvol.c"/ - * { - * printf("%s: %s", stringof(arg1), stringof(arg3)) - * } ' - */ - dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n", - ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride); - dprintf_bp(bp, "%s", "next blkptr:"); - /* start a new extent */ - if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) { - ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t), - KM_SLEEP); - ma->ma_list = ma->ma_list->zl_next; - ma->ma_extent = &ma->ma_list->zl_extents[0]; - } else { - ma->ma_extent++; - } - zvol_init_extent(ma->ma_extent, bp); - return (0); -} - -/* ARGSUSED */ -void -zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - nvlist_t *nvprops = zct->zct_props; - int error; - uint64_t volblocksize, volsize; - - VERIFY(nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) - volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); - - /* - * These properties must be removed from the list so the generic - * property setting step won't apply to them. 
- */ - VERIFY(nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); - (void) nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); - - error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); -} - -/* - * Replay a TX_WRITE ZIL transaction that didn't get committed - * after a system failure - */ -static int -zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) -{ - objset_t *os = zv->zv_objset; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t off = lr->lr_offset; - uint64_t len = lr->lr_length; - dmu_tx_t *tx; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, len, data, tx); - dmu_tx_commit(tx); - } - - return (error); -} - -/* ARGSUSED */ -static int -zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) -{ - return (ENOTSUP); -} - -/* - * Callback vectors for replaying records. - * Only TX_WRITE is needed for zvol. 
- */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { - zvol_replay_err, /* 0 no such transaction type */ - zvol_replay_err, /* TX_CREATE */ - zvol_replay_err, /* TX_MKDIR */ - zvol_replay_err, /* TX_MKXATTR */ - zvol_replay_err, /* TX_SYMLINK */ - zvol_replay_err, /* TX_REMOVE */ - zvol_replay_err, /* TX_RMDIR */ - zvol_replay_err, /* TX_LINK */ - zvol_replay_err, /* TX_RENAME */ - zvol_replay_write, /* TX_WRITE */ - zvol_replay_err, /* TX_TRUNCATE */ - zvol_replay_err, /* TX_SETATTR */ - zvol_replay_err, /* TX_ACL */ -}; - -/* - * reconstruct dva that gets us to the desired offset (offset - * is in bytes) - */ -int -zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva) -{ - zvol_ext_list_t *zl; - zvol_extent_t *ze; - int idx; - uint64_t tmp; - - if ((zl = zv->zv_list) == NULL) - return (EIO); - idx = 0; - ze = &zl->zl_extents[0]; - while (offset >= ze->ze_size * zv->zv_volblocksize) { - offset -= ze->ze_size * zv->zv_volblocksize; - - if (idx == NUM_EXTENTS - 1) { - /* we've reached the end of this array */ - ASSERT(zl->zl_next != NULL); - if (zl->zl_next == NULL) - return (-1); - zl = zl->zl_next; - ze = &zl->zl_extents[0]; - idx = 0; - } else { - ze++; - idx++; - } - } - DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva)); - tmp = DVA_GET_OFFSET((&ze->ze_dva)); - tmp += (ze->ze_stride * (offset / zv->zv_volblocksize)); - DVA_SET_OFFSET(dva, tmp); - return (0); -} - -static void -zvol_free_extents(zvol_state_t *zv) -{ - zvol_ext_list_t *zl; - zvol_ext_list_t *tmp; - - if (zv->zv_list != NULL) { - zl = zv->zv_list; - while (zl != NULL) { - tmp = zl->zl_next; - kmem_free(zl, sizeof (zvol_ext_list_t)); - zl = tmp; - } - zv->zv_list = NULL; - } -} - -int -zvol_get_lbas(zvol_state_t *zv) -{ - struct maparg ma; - zvol_ext_list_t *zl; - zvol_extent_t *ze; - uint64_t blocks = 0; - int err; - - ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP); - ma.ma_extent = &ma.ma_list->zl_extents[0]; - ma.ma_gang = 0; - zv->zv_list = ma.ma_list; - - err = 
traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma); - if (err == EINTR && ma.ma_gang) { - /* - * We currently don't support dump devices when the pool - * is so fragmented that our allocation has resulted in - * gang blocks. - */ - zvol_free_extents(zv); - return (EFRAGS); - } - ASSERT3U(err, ==, 0); - - ze = &zl->zl_extents[0]; - while (ze) { - blocks += ze->ze_size; - if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) { - zl = zl->zl_next; - ze = &zl->zl_extents[0]; - } else { - ze++; - } - } - if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) { - zvol_free_extents(zv); - return (EIO); - } - - return (0); -} - -/* - * Create a minor node (plus a whole lot more) for the specified volume. - */ -int -zvol_create_minor(const char *name, major_t maj) -{ - zvol_state_t *zv; - objset_t *os; - dmu_object_info_t doi; - uint64_t volsize; - minor_t minor = 0; - struct pathname linkpath; - int ds_mode = DS_MODE_PRIMARY; - vnode_t *vp = NULL; - char *devpath; - size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; - char chrbuf[30], blkbuf[30]; - int error; - - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) != NULL) { - mutex_exit(&zvol_state_lock); - return (EEXIST); - } - - if (strchr(name, '@') != 0) - ds_mode |= DS_MODE_READONLY; - - error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); - - if (error) { - mutex_exit(&zvol_state_lock); - return (error); - } - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - - if (error) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (error); - } - - /* - * If there's an existing /dev/zvol symlink, try to use the - * same minor number we used last time. 
- */ - devpath = kmem_alloc(devpathlen, KM_SLEEP); - - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); - - error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); - - kmem_free(devpath, devpathlen); - - if (error == 0 && vp->v_type != VLNK) - error = EINVAL; - - if (error == 0) { - pn_alloc(&linkpath); - error = pn_getsymlink(vp, &linkpath, kcred); - if (error == 0) { - char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); - if (ms != NULL) { - ms += strlen(ZVOL_PSEUDO_DEV); - minor = stoi(&ms); - } - } - pn_free(&linkpath); - } - - if (vp != NULL) - VN_RELE(vp); - - /* - * If we found a minor but it's already in use, we must pick a new one. - */ - if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) - minor = 0; - - if (minor == 0) - minor = zvol_minor_alloc(); - - if (minor == 0) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); - } - - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, - (char *)name); - - (void) sprintf(chrbuf, "%uc,raw", minor); - - if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); - } - - (void) sprintf(blkbuf, "%uc", minor); - - if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_remove_minor_node(zfs_dip, chrbuf); - ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); - } - - zv = ddi_get_soft_state(zvol_state, minor); - - (void) strcpy(zv->zv_name, name); - zv->zv_min_bs = DEV_BSHIFT; - zv->zv_minor = minor; - zv->zv_volsize = volsize; - zv->zv_objset = os; - zv->zv_mode = ds_mode; - zv->zv_zilog = zil_open(os, zvol_get_data); - 
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - ASSERT(error == 0); - zv->zv_volblocksize = doi.doi_data_block_size; - - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); - zvol_size_changed(zv, maj); - - /* XXX this should handle the possible i/o error */ - VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - zvol_minors++; - - mutex_exit(&zvol_state_lock); - - return (0); -} - -/* - * Remove minor node for the specified volume. - */ -int -zvol_remove_minor(const char *name) -{ - zvol_state_t *zv; - char namebuf[30]; - - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); - } - - (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); - - (void) sprintf(namebuf, "%uc", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); - - VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - zil_close(zv->zv_zilog); - zv->zv_zilog = NULL; - dmu_objset_close(zv->zv_objset); - zv->zv_objset = NULL; - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); - - ddi_soft_state_free(zvol_state, zv->zv_minor); - - zvol_minors--; - - mutex_exit(&zvol_state_lock); - - return (0); -} - -static int -zvol_truncate(zvol_state_t *zv, uint64_t offset, uint64_t size) -{ - dmu_tx_t *tx; - int error; - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_free(tx, ZVOL_OBJ, offset, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, offset, 
size, tx); - dmu_tx_commit(tx); - return (0); -} - -int -zvol_prealloc(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - dmu_tx_t *tx; - void *data; - uint64_t refd, avail, usedobjs, availobjs; - uint64_t resid = zv->zv_volsize; - uint64_t off = 0; - - /* Check the space usage before attempting to allocate the space */ - dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); - if (avail < zv->zv_volsize) - return (ENOSPC); - - /* Free old extents if they exist */ - zvol_free_extents(zv); - - /* allocate the blocks by writing each one */ - data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP); - - while (resid != 0) { - int error; - uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - kmem_free(data, SPA_MAXBLOCKSIZE); - (void) zvol_truncate(zv, 0, off); - return (error); - } - dmu_write(os, ZVOL_OBJ, off, bytes, data, tx); - dmu_tx_commit(tx); - off += bytes; - resid -= bytes; - } - kmem_free(data, SPA_MAXBLOCKSIZE); - txg_wait_synced(dmu_objset_pool(os), 0); - - return (0); -} - -int -zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) -{ - dmu_tx_t *tx; - int error; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, - &volsize, tx); - dmu_tx_commit(tx); - - if (error == 0) - error = zvol_truncate(zv, volsize, DMU_OBJECT_END); - - if (error == 0) { - zv->zv_volsize = volsize; - zvol_size_changed(zv, maj); - } - return (error); -} - -int -zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) -{ - zvol_state_t *zv; - int error; - dmu_object_info_t doi; - uint64_t old_volsize = 0ULL; - - 
mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - old_volsize = zv->zv_volsize; - - if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) { - mutex_exit(&zvol_state_lock); - return (error); - } - - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - mutex_exit(&zvol_state_lock); - return (EROFS); - } - - error = zvol_update_volsize(zv, maj, volsize); - - /* - * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore the it back to - * it's original size. - */ - if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - (void) zvol_update_volsize(zv, maj, old_volsize); - error = zvol_dumpify(zv); - } - } - - mutex_exit(&zvol_state_lock); - - return (error); -} - -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - mutex_exit(&zvol_state_lock); - return (EROFS); - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = EBUSY; - dmu_tx_commit(tx); - } - - mutex_exit(&zvol_state_lock); - - return (error); -} - -/*ARGSUSED*/ -int -zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) -{ - minor_t minor = getminor(*devp); - zvol_state_t *zv; - - if (minor == 0) /* This is the control device */ - return (0); - - mutex_enter(&zvol_state_lock); - - zv = 
ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - ASSERT(zv->zv_objset != NULL); - - if ((flag & FWRITE) && - (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) { - mutex_exit(&zvol_state_lock); - return (EROFS); - } - - if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { - zv->zv_open_count[otyp]++; - zv->zv_total_opens++; - } - - mutex_exit(&zvol_state_lock); - - return (0); -} - -/*ARGSUSED*/ -int -zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - - if (minor == 0) /* This is the control device */ - return (0); - - mutex_enter(&zvol_state_lock); - - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - /* - * The next statement is a workaround for the following DDI bug: - * 6343604 specfs race: multiple "last-close" of the same device - */ - if (zv->zv_total_opens == 0) { - mutex_exit(&zvol_state_lock); - return (0); - } - - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ - ASSERT(zv->zv_open_count[otyp] != 0); - ASSERT(zv->zv_total_opens != 0); - - /* - * You may get multiple opens, but only one close. - */ - zv->zv_open_count[otyp]--; - zv->zv_total_opens--; - - mutex_exit(&zvol_state_lock); - - return (0); -} - -static void -zvol_get_done(dmu_buf_t *db, void *vzgd) -{ - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - kmem_free(zgd, sizeof (zgd_t)); -} - -/* - * Get data to generate a TX_WRITE intent log record. 
- */ -static int -zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) -{ - zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - dmu_buf_t *db; - rl_t *rl; - zgd_t *zgd; - uint64_t boff; /* block starting offset */ - int dlen = lr->lr_length; /* length of user data */ - int error; - - ASSERT(zio); - ASSERT(dlen != 0); - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); - - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_bp = &lr->lr_blkptr; - - /* - * Lock the range of the block to ensure that when the data is - * written out and its checksum is being calculated that no other - * thread can change the block. - */ - boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); - rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, - RL_READER); - zgd->zgd_rl = rl; - - VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) - zil_add_block(zv->zv_zilog, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zvol_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - zfs_range_unlock(rl); - kmem_free(zgd, sizeof (zgd_t)); - return (error); -} - -/* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. 
- * Otherwise we will later flush the data out via dmu_sync(). - */ -ssize_t zvol_immediate_write_sz = 32768; - -static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) -{ - uint32_t blocksize = zv->zv_volblocksize; - lr_write_t *lr; - - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - - itx->itx_wr_state = - len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; - lr = (lr_write_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); - BP_ZERO(&lr->lr_blkptr); - - (void) zil_itx_assign(zv->zv_zilog, itx, tx); - len -= nbytes; - off += nbytes; - } -} - -int -zvol_dumpio(vdev_t *vd, uint64_t size, uint64_t offset, void *addr, - int bflags, int isdump) -{ - vdev_disk_t *dvd; - int direction; - int c; - int numerrors = 0; - - for (c = 0; c < vd->vdev_children; c++) { - if (zvol_dumpio(vd->vdev_child[c], size, offset, - addr, bflags, isdump) != 0) { - numerrors++; - } else if (bflags & B_READ) { - break; - } - } - - if (!vd->vdev_ops->vdev_op_leaf) - return (numerrors < vd->vdev_children ? 
0 : EIO); - - if (!vdev_writeable(vd)) - return (EIO); - - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - direction = bflags & (B_WRITE | B_READ); - ASSERT(ISP2(direction)); - offset += VDEV_LABEL_START_SIZE; - - if (ddi_in_panic() || isdump) { - if (direction & B_READ) - return (EIO); - return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), - lbtodb(size))); - } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - direction)); - } -} - -int -zvol_physio(zvol_state_t *zv, int bflags, uint64_t off, - uint64_t size, void *addr, int isdump) -{ - dva_t dva; - vdev_t *vd; - int error; - spa_t *spa = dmu_objset_spa(zv->zv_objset); - - ASSERT(size <= zv->zv_volblocksize); - - /* restrict requests to multiples of the system block size */ - if (P2PHASE(off, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE)) - return (EINVAL); - - if (zvol_get_dva(zv, off, &dva) != 0) - return (EIO); - - spa_config_enter(spa, RW_READER, FTAG); - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); - - error = zvol_dumpio(vd, size, - DVA_GET_OFFSET(&dva) + (off % zv->zv_volblocksize), - addr, bflags & (B_READ | B_WRITE | B_PHYS), isdump); - - spa_config_exit(spa, FTAG); - return (error); -} - -int -zvol_strategy(buf_t *bp) -{ - zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); - uint64_t off, volsize; - size_t size, resid; - char *addr; - objset_t *os; - rl_t *rl; - int error = 0; - boolean_t reading, is_dump = zv->zv_flags & ZVOL_DUMPIFIED; - - if (zv == NULL) { - bioerror(bp, ENXIO); - biodone(bp); - return (0); - } - - if (getminor(bp->b_edev) == 0) { - bioerror(bp, EINVAL); - biodone(bp); - return (0); - } - - if (!(bp->b_flags & B_READ) && - (zv->zv_flags & ZVOL_RDONLY || - zv->zv_mode & DS_MODE_READONLY)) { - bioerror(bp, EROFS); - biodone(bp); - return (0); - } - - off = ldbtob(bp->b_blkno); - volsize = zv->zv_volsize; - - os = zv->zv_objset; - ASSERT(os != NULL); - - bp_mapin(bp); - addr = bp->b_un.b_addr; - resid = bp->b_bcount; - - /* - * There must be no buffer 
changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - reading = bp->b_flags & B_READ; - rl = zfs_range_lock(&zv->zv_znode, off, resid, - reading ? RL_READER : RL_WRITER); - - if (resid > volsize - off) /* don't write past the end */ - resid = volsize - off; - - while (resid != 0 && off < volsize) { - - size = MIN(resid, zvol_maxphys); - if (is_dump) { - /* can't straddle a block boundary */ - size = MIN(size, P2END(off, zv->zv_volblocksize) - off); - error = zvol_physio(zv, bp->b_flags, off, size, - addr, 0); - } else if (reading) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); - dmu_tx_commit(tx); - } - } - if (error) - break; - off += size; - addr += size; - resid -= size; - } - zfs_range_unlock(rl); - - if ((bp->b_resid = resid) == bp->b_bcount) - bioerror(bp, off > volsize ? EINVAL : error); - - if (!(bp->b_flags & B_ASYNC) && !reading && !zil_disable && !is_dump) - zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); - biodone(bp); - - return (0); -} - -/* - * Set the buffer count to the zvol maximum transfer. - * Using our own routine instead of the default minphys() - * means that for larger writes we write bigger buffers on X86 - * (128K instead of 56K) and flush the disk write cache less often - * (every zvol_maxphys - currently 1MB) instead of minphys (currently - * 56K on X86 and 128K on sparc). 
- */ -void -zvol_minphys(struct buf *bp) -{ - if (bp->b_bcount > zvol_maxphys) - bp->b_bcount = zvol_maxphys; -} - -int -zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - int error = 0; - uint64_t size; - uint64_t boff; - uint64_t resid; - - if (minor == 0) /* This is the control device */ - return (ENXIO); - - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); - - boff = ldbtob(blkno); - resid = ldbtob(nblocks); - if (boff + resid > zv->zv_volsize) { - /* dump should know better than to write here */ - ASSERT(blkno + resid <= zv->zv_volsize); - return (EIO); - } - while (resid) { - /* can't straddle a block boundary */ - size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); - - error = zvol_physio(zv, B_WRITE, boff, size, addr, 1); - if (error) - break; - boff += size; - addr += size; - resid -= size; - } - - return (error); -} - -/*ARGSUSED*/ -int -zvol_read(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - rl_t *rl; - int error = 0; - - if (minor == 0) /* This is the control device */ - return (ENXIO); - - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); - - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); - while (uio->uio_resid > 0) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); - if (error) - break; - } - zfs_range_unlock(rl); - return (error); -} - -/*ARGSUSED*/ -int -zvol_write(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - rl_t *rl; - int error = 0; - - if (minor == 0) /* This is the control device */ - return (ENXIO); - - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); - - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return 
(error); - } - - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); - while (uio->uio_resid > 0) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - } - error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); - if (error == 0) - zvol_log_write(zv, tx, off, bytes); - dmu_tx_commit(tx); - - if (error) - break; - } - zfs_range_unlock(rl); - return (error); -} - -/* - * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). - */ -/*ARGSUSED*/ -int -zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) -{ - zvol_state_t *zv; - struct dk_cinfo dki; - struct dk_minfo dkm; - dk_efi_t efi; - struct dk_callback *dkc; - struct uuid uuid = EFI_RESERVED; - uint32_t crc; - int error = 0; - rl_t *rl; - - mutex_enter(&zvol_state_lock); - - zv = ddi_get_soft_state(zvol_state, getminor(dev)); - - if (zv == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - switch (cmd) { - - case DKIOCINFO: - bzero(&dki, sizeof (dki)); - (void) strcpy(dki.dki_cname, "zvol"); - (void) strcpy(dki.dki_dname, "zvol"); - dki.dki_ctype = DKC_UNKNOWN; - dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); - mutex_exit(&zvol_state_lock); - if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) - error = EFAULT; - return (error); - - case DKIOCGMEDIAINFO: - bzero(&dkm, sizeof (dkm)); - dkm.dki_lbsize = 1U << zv->zv_min_bs; - dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; - dkm.dki_media_type = DK_UNKNOWN; - mutex_exit(&zvol_state_lock); - if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) - error = EFAULT; - return (error); - - case DKIOCGETEFI: - if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) { - mutex_exit(&zvol_state_lock); - return (EFAULT); - } - 
efi.dki_data = (void *)(uintptr_t)efi.dki_data_64; - - /* - * Some clients may attempt to request a PMBR for the - * zvol. Currently this interface will return ENOTTY to - * such requests. These requests could be supported by - * adding a check for lba == 0 and consing up an appropriate - * PMBR. - */ - if (efi.dki_lba == 1) { - efi_gpt_t gpt; - efi_gpe_t gpe; - - bzero(&gpt, sizeof (gpt)); - bzero(&gpe, sizeof (gpe)); - - if (efi.dki_length < sizeof (gpt)) { - mutex_exit(&zvol_state_lock); - return (EINVAL); - } - - gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); - gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); - gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); - gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); - gpt.efi_gpt_LastUsableLBA = - LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); - gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); - gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); - gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe)); - - UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); - gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA; - gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA; - - CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); - gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); - - CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); - gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); - - mutex_exit(&zvol_state_lock); - if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag)) - error = EFAULT; - } else if (efi.dki_lba == 2) { - efi_gpe_t gpe; - - bzero(&gpe, sizeof (gpe)); - - if (efi.dki_length < sizeof (gpe)) { - mutex_exit(&zvol_state_lock); - return (EINVAL); - } - - UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); - gpe.efi_gpe_StartingLBA = LE_64(34ULL); - gpe.efi_gpe_EndingLBA = - LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); - - mutex_exit(&zvol_state_lock); - if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag)) - error = EFAULT; - } else { - mutex_exit(&zvol_state_lock); - error = EINVAL; - } - return (error); - - case 
DKIOCFLUSHWRITECACHE: - dkc = (struct dk_callback *)arg; - zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); - if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { - (*dkc->dkc_callback)(dkc->dkc_cookie, error); - error = 0; - } - break; - - case DKIOCGGEOM: - case DKIOCGVTOC: - /* - * commands using these (like prtvtoc) expect ENOTSUP - * since we're emulating an EFI label - */ - error = ENOTSUP; - break; - - case DKIOCDUMPINIT: - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dumpify(zv); - zfs_range_unlock(rl); - break; - - case DKIOCDUMPFINI: - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dump_fini(zv); - zfs_range_unlock(rl); - break; - - default: - error = ENOTTY; - break; - - } - mutex_exit(&zvol_state_lock); - return (error); -} - -int -zvol_busy(void) -{ - return (zvol_minors != 0); -} - -void -zvol_init(void) -{ - VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0); - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); -} - -void -zvol_fini(void) -{ - mutex_destroy(&zvol_state_lock); - ddi_soft_state_fini(&zvol_state); -} - -static boolean_t -zvol_is_swap(zvol_state_t *zv) -{ - vnode_t *vp; - boolean_t ret = B_FALSE; - char *devpath; - size_t devpathlen; - int error; - - devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; - devpath = kmem_alloc(devpathlen, KM_SLEEP); - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); - error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - kmem_free(devpath, devpathlen); - - ret = !error && IS_SWAPVP(common_specvp(vp)); - - if (vp != NULL) - VN_RELE(vp); - - return (ret); -} - -static int -zvol_dump_init(zvol_state_t *zv, boolean_t resize) -{ - dmu_tx_t *tx; - int error = 0; - objset_t *os = zv->zv_objset; - nvlist_t *nv = NULL; - uint64_t checksum, compress, refresrv; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, ZVOL_OBJ, 0, 
DMU_OBJECT_END); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - /* - * If we are resizing the dump device then we only need to - * update the refreservation to match the newly updated - * zvolsize. Otherwise, we save off the original state of the - * zvol so that we can restore them if the zvol is ever undumpified. - */ - if (resize) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &zv->zv_volsize, tx); - } else { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); - error = error ? error : dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); - error = error ? error : dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); - - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, - &compress, tx); - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &refresrv, tx); - } - dmu_tx_commit(tx); - - /* Truncate the file */ - if (!error) - error = zvol_truncate(zv, 0, DMU_OBJECT_END); - - if (error) - return (error); - - /* - * We only need update the zvol's property if we are initializing - * the dump area for the first time. 
- */ - if (!resize) { - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - ZIO_COMPRESS_OFF) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), - ZIO_CHECKSUM_OFF) == 0); - - error = zfs_set_prop_nvlist(zv->zv_name, nv); - nvlist_free(nv); - - if (error) - return (error); - } - - /* Allocate the space for the dump */ - error = zvol_prealloc(zv); - return (error); -} - -static int -zvol_dumpify(zvol_state_t *zv) -{ - int error = 0; - uint64_t dumpsize = 0; - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) - return (EROFS); - - /* - * We do not support swap devices acting as dump devices. - */ - if (zvol_is_swap(zv)) - return (ENOTSUP); - - if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, - 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { - boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; - - if ((error = zvol_dump_init(zv, resize)) != 0) { - (void) zvol_dump_fini(zv); - return (error); - } - } - - /* - * Build up our lba mapping. 
- */ - error = zvol_get_lbas(zv); - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) zvol_dump_fini(zv); - return (error); - } - - zv->zv_flags |= ZVOL_DUMPIFIED; - error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, - &zv->zv_volsize, tx); - dmu_tx_commit(tx); - - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - txg_wait_synced(dmu_objset_pool(os), 0); - return (0); -} - -static int -zvol_dump_fini(zvol_state_t *zv) -{ - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - nvlist_t *nv; - int error = 0; - uint64_t checksum, compress, refresrv; - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - /* - * Attempt to restore the zvol back to its pre-dumpified state. - * This is a best-effort attempt as it's possible that not all - * of these properties were initialized during the dumpify process - * (i.e. error during zvol_dump_init). 
- */ - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); - - (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); - zvol_free_extents(zv); - zv->zv_flags &= ~ZVOL_DUMPIFIED; - dmu_tx_commit(tx); - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); - (void) zfs_set_prop_nvlist(zv->zv_name, nv); - nvlist_free(nv); - - return (0); -} |