diff options
author | Matthew Macy <[email protected]> | 2019-11-21 09:32:57 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-11-21 09:32:57 -0800 |
commit | da92d5cbb38cea3a860b8a6bb8ee21f9129e7d7c (patch) | |
tree | cc2d84b481a30b43d4097603e79a55a1975b0b64 /lib | |
parent | 67a6c3bc9ff401fa04bc41354c5172b51aaed1c9 (diff) |
Add zfs_file_* interface, remove vnodes
Provide a common zfs_file_* interface which can be implemented on all
platforms to perform normal file access from either the kernel module
or the libzpool library.
This allows all non-portable vnode_t usage in the common code to be
replaced by the new portable zfs_file_t. The associated vnode and
kobj compatibility functions, types, and macros have been removed
from the SPL. Moving forward, vnodes should only be used in platform
specific code when provided by the native operating system.
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Igor Kozhukhov <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Signed-off-by: Matt Macy <[email protected]>
Closes #9556
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libspl/include/os/linux/sys/Makefile.am | 1 | ||||
-rw-r--r-- | lib/libspl/include/os/linux/sys/file.h | 49 | ||||
-rw-r--r-- | lib/libzpool/kernel.c | 610 |
3 files changed, 376 insertions, 284 deletions
diff --git a/lib/libspl/include/os/linux/sys/Makefile.am b/lib/libspl/include/os/linux/sys/Makefile.am index 6b170fa8c..f8b6d9fae 100644 --- a/lib/libspl/include/os/linux/sys/Makefile.am +++ b/lib/libspl/include/os/linux/sys/Makefile.am @@ -2,7 +2,6 @@ libspldir = $(includedir)/libspl/sys libspl_HEADERS = \ $(top_srcdir)/lib/libspl/include/os/linux/sys/byteorder.h \ $(top_srcdir)/lib/libspl/include/os/linux/sys/errno.h \ - $(top_srcdir)/lib/libspl/include/os/linux/sys/file.h \ $(top_srcdir)/lib/libspl/include/os/linux/sys/mnttab.h \ $(top_srcdir)/lib/libspl/include/os/linux/sys/mount.h \ $(top_srcdir)/lib/libspl/include/os/linux/sys/param.h \ diff --git a/lib/libspl/include/os/linux/sys/file.h b/lib/libspl/include/os/linux/sys/file.h deleted file mode 100644 index e0752ac25..000000000 --- a/lib/libspl/include/os/linux/sys/file.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _LIBSPL_SYS_FILE_H -#define _LIBSPL_SYS_FILE_H - -#include_next <sys/file.h> - -#include <sys/user.h> - -#define FREAD 1 -#define FWRITE 2 -// #define FAPPEND 8 - -#define FCREAT O_CREAT -#define FTRUNC O_TRUNC -#define FOFFMAX O_LARGEFILE -#define FSYNC O_SYNC -#define FDSYNC O_DSYNC -#define FEXCL O_EXCL - -#define FNODSYNC 0x10000 /* fsync pseudo flag */ -#define FNOFOLLOW 0x20000 /* don't follow symlinks */ -#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ - -#endif diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index c14468cb2..ef52ed3af 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -49,7 +49,6 @@ int aok; uint64_t physmem; -vnode_t *rootdir = (vnode_t *)0xabcd1234; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; vmem_t *zio_arena = NULL; @@ -488,183 +487,6 @@ procfs_list_add(procfs_list_t *procfs_list, void *p) * vnode operations * ========================================================================= */ -/* - * Note: for the xxxat() versions of these functions, we assume that the - * starting vp is always rootdir (which is true for spa_directory.c, the only - * ZFS consumer of these interfaces). We assert this is true, and then emulate - * them by adding '/' in front of the path. - */ - -/*ARGSUSED*/ -int -vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) -{ - int fd = -1; - int dump_fd = -1; - vnode_t *vp; - int old_umask = 0; - struct stat64 st; - int err; - - if (!(flags & FCREAT) && stat64(path, &st) == -1) { - err = errno; - return (err); - } - - if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) - flags |= O_DIRECT; - - if (flags & FCREAT) - old_umask = umask(0); - - /* - * The construct 'flags - FREAD' conveniently maps combinations of - * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. - */ - fd = open64(path, flags - FREAD, mode); - if (fd == -1) { - err = errno; - return (err); - } - - if (flags & FCREAT) - (void) umask(old_umask); - - if (vn_dumpdir != NULL) { - char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); - (void) snprintf(dumppath, MAXPATHLEN, - "%s/%s", vn_dumpdir, basename(path)); - dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); - umem_free(dumppath, MAXPATHLEN); - if (dump_fd == -1) { - err = errno; - close(fd); - return (err); - } - } else { - dump_fd = -1; - } - - if (fstat64_blk(fd, &st) == -1) { - err = errno; - close(fd); - if (dump_fd != -1) - close(dump_fd); - return (err); - } - - (void) fcntl(fd, F_SETFD, FD_CLOEXEC); - - *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); - - vp->v_fd = fd; - vp->v_size = st.st_size; - vp->v_path = spa_strdup(path); - vp->v_dump_fd = dump_fd; - - return (0); -} - -/*ARGSUSED*/ -int -vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, - int x3, vnode_t *startvp, int fd) -{ - char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); - int ret; - - ASSERT(startvp == rootdir); - (void) sprintf(realpath, "/%s", path); - - /* fd ignored for now, need if want to simulate nbmand support */ - ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); - - umem_free(realpath, strlen(path) + 2); - - return (ret); -} - -/*ARGSUSED*/ -int -vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) -{ - ssize_t rc, done = 0, split; - - if (uio == UIO_READ) { - rc = pread64(vp->v_fd, addr, len, offset); - if (vp->v_dump_fd != -1 && rc != -1) { - int status; - status = pwrite64(vp->v_dump_fd, addr, rc, offset); - ASSERT(status != -1); - } - } else { - /* - * To simulate partial disk writes, we split writes into two - * system calls so that the process can be killed in between. - */ - int sectors = len >> SPA_MINBLOCKSHIFT; - split = (sectors > 0 ? rand() % sectors : 0) << - SPA_MINBLOCKSHIFT; - rc = pwrite64(vp->v_fd, addr, split, offset); - if (rc != -1) { - done = rc; - rc = pwrite64(vp->v_fd, (char *)addr + split, - len - split, offset + split); - } - } - -#ifdef __linux__ - if (rc == -1 && errno == EINVAL) { - /* - * Under Linux, this most likely means an alignment issue - * (memory or disk) due to O_DIRECT, so we abort() in order to - * catch the offender. - */ - abort(); - } -#endif - if (rc == -1) - return (errno); - - done += rc; - - if (residp) - *residp = len - done; - else if (done != len) - return (EIO); - return (0); -} - -void -vn_close(vnode_t *vp) -{ - close(vp->v_fd); - if (vp->v_dump_fd != -1) - close(vp->v_dump_fd); - spa_strfree(vp->v_path); - umem_free(vp, sizeof (vnode_t)); -} - -/* - * At a minimum we need to update the size since vdev_reopen() - * will no longer call vn_openat(). - */ -int -fop_getattr(vnode_t *vp, vattr_t *vap) -{ - struct stat64 st; - int err; - - if (fstat64_blk(vp->v_fd, &st) == -1) { - err = errno; - close(vp->v_fd); - return (err); - } - - vap->va_size = st.st_size; - return (0); -} /* * ========================================================================= @@ -860,60 +682,6 @@ cmn_err(int ce, const char *fmt, ...) /* * ========================================================================= - * kobj interfaces - * ========================================================================= - */ -struct _buf * -kobj_open_file(char *name) -{ - struct _buf *file; - vnode_t *vp; - - /* set vp as the _fd field of the file */ - if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, - -1) != 0) - return ((void *)-1UL); - - file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); - file->_fd = (intptr_t)vp; - return (file); -} - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - ssize_t resid = 0; - - if (vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, - UIO_SYSSPACE, 0, 0, 0, &resid) != 0) - return (-1); - - return (size - resid); -} - -void -kobj_close_file(struct _buf *file) -{ - vn_close((vnode_t *)file->_fd); - umem_free(file, sizeof (struct _buf)); -} - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - struct stat64 st; - vnode_t *vp = (vnode_t *)file->_fd; - - if (fstat64(vp->v_fd, &st) == -1) { - vn_close(vp); - return (errno); - } - *size = st.st_size; - return (0); -} - -/* - * ========================================================================= * misc routines * ========================================================================= */ @@ -1059,7 +827,7 @@ kernel_init(int mode) (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", - (mode & FWRITE) ? get_system_hostid() : 0); + (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0); random_init(); @@ -1068,7 +836,7 @@ kernel_init(int mode) system_taskq_init(); icp_init(); - spa_init(mode); + spa_init((spa_mode_t)mode); fletcher_4_init(); @@ -1265,3 +1033,377 @@ zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. + */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + int fd = -1; + int dump_fd = -1; + int err; + int old_umask = 0; + zfs_file_t *fp; + struct stat64 st; + + if (!(flags & O_CREAT) && stat64(path, &st) == -1) + return (errno); + + if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) + flags |= O_DIRECT; + + if (flags & O_CREAT) + old_umask = umask(0); + + fd = open64(path, flags, mode); + if (fd == -1) + return (errno); + + if (flags & O_CREAT) + (void) umask(old_umask); + + if (vn_dumpdir != NULL) { + char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); + char *inpath = basename((char *)(uintptr_t)path); + + (void) snprintf(dumppath, MAXPATHLEN, + "%s/%s", vn_dumpdir, inpath); + dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); + umem_free(dumppath, MAXPATHLEN); + if (dump_fd == -1) { + err = errno; + close(fd); + return (err); + } + } else { + dump_fd = -1; + } + + (void) fcntl(fd, F_SETFD, FD_CLOEXEC); + + fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); + fp->f_fd = fd; + fp->f_dump_fd = dump_fd; + *fpp = fp; + + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + close(fp->f_fd); + if (fp->f_dump_fd != -1) + close(fp->f_dump_fd); + + umem_free(fp, sizeof (zfs_file_t)); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + ssize_t rc; + + rc = write(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, + size_t count, loff_t pos, ssize_t *resid) +{ + ssize_t rc, split, done; + int sectors; + + /* + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + * This is used by ztest to simulate realistic failure modes. + */ + sectors = count >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; + rc = pwrite64(fp->f_fd, buf, split, pos); + if (rc != -1) { + done = rc; + rc = pwrite64(fp->f_fd, (char *)buf + split, + count - split, pos + split); + } +#ifdef __linux__ + if (rc == -1 && errno == EINVAL) { + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order + * to catch the offender. + */ + abort(); + } +#endif + + if (rc < 0) + return (errno); + + done += rc; + + if (resid) { + *resid = count - done; + } else if (done != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + int rc; + + rc = read(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless read - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to write + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = pread64(fp->f_fd, buf, count, off); + if (rc < 0) { +#ifdef __linux__ + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order to + * catch the offender. + */ + if (errno == EINVAL) + abort(); +#endif + return (errno); + } + + if (fp->f_dump_fd != -1) { + int status; + + status = pwrite64(fp->f_dump_fd, buf, rc, off); + ASSERT(status != -1); + } + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + loff_t rc; + + rc = lseek(fp->f_fd, *offp, whence); + if (rc < 0) + return (errno); + + *offp = rc; + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct stat64 st; + + if (fstat64_blk(fp->f_fd, &st) == -1) + return (errno); + + zfattr->zfa_size = st.st_size; + zfattr->zfa_mode = st.st_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + int rc; + + rc = fsync(fp->f_fd); + if (rc < 0) + return (errno); + + return (0); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ + return (fallocate(fp->f_fd, mode, offset, len)); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (lseek(fp->f_fd, SEEK_CUR, 0)); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. + * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (remove(path)); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * fpp - pointer to file pointer + * + * Returns 0 on success EBADF on failure. + * Unsupported in user space. + */ +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + abort(); + + return (EOPNOTSUPP); +} + +/* + * Drop reference to file pointer + * + * fd - input file descriptor + * + * Unsupported in user space. + */ +void +zfs_file_put(int fd) +{ + abort(); +} |