aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristian Schwarz <[email protected]>2020-11-02 21:07:07 +0100
committerTony Hutter <[email protected]>2021-06-23 13:22:15 -0700
commitab2110e3e28897c1f5a7c047781c17a4ceec9b0c (patch)
tree301a134aea0074fb172456b682e01cd6ff682ce6
parenta909190683493f5f73337e15174cc0f87f538f61 (diff)
zfs_vnops: make zfs_get_data OS-independent
Move zfs_get_data() in to platform-independent code. The only platform-specific aspect of it is the way we release an inode (Linux) / vnode_t (FreeBSD). I am not aware of a platform that could be supported by ZFS that couldn't implement zfs_rele_async itself. It's sibling zvol_get_data already is platform-independent. Reviewed-by: Ryan Moeller <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Christian Schwarz <[email protected]> Closes #10979
-rw-r--r--include/os/freebsd/zfs/sys/zfs_znode_impl.h1
-rw-r--r--include/os/linux/zfs/sys/zfs_znode_impl.h1
-rw-r--r--include/sys/zfs_vnops.h13
-rw-r--r--module/os/freebsd/zfs/zfs_vfsops.c1
-rw-r--r--module/os/freebsd/zfs/zfs_vnops_os.c161
-rw-r--r--module/os/linux/zfs/zfs_vnops_os.c172
-rw-r--r--module/zfs/zfs_vnops.c159
7 files changed, 179 insertions, 329 deletions
diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h
index b5de7e0b6..ac2625d9a 100644
--- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h
+++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h
@@ -173,7 +173,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
extern void zfs_znode_free(struct znode *);
-extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;
diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h
index 79049df69..e95aef6c8 100644
--- a/include/os/linux/zfs/sys/zfs_znode_impl.h
+++ b/include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -173,7 +173,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t);
#endif /* HAVE_UIO_RW */
-extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;
diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h
index 9c76e4855..6bf077b4b 100644
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
extern int mappedread_sf(znode_t *, int, uio_t *);
extern void update_pages(znode_t *, int64_t, int, objset_t *);
+/*
+ * Platform code that asynchronously drops zp's inode / vnode_t.
+ *
+ * Asynchronous dropping ensures that the caller will never drop the
+ * last reference on an inode / vnode_t in the current context.
+ * Doing so while holding open a tx could result in a deadlock if
+ * the platform calls into filesystem again in the implementation
+ * of inode / vnode_t dropping (e.g. call from iput_final()).
+ */
+extern void zfs_zrele_async(znode_t *zp);
+
+extern zil_get_data_t zfs_get_data;
+
#endif
diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index bc414d551..a537342f9 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -42,6 +42,7 @@
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 079f7a2ec..59734e63d 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -29,6 +29,7 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}
-static void
-zfs_get_done(zgd_t *zgd, int error)
-{
- znode_t *zp = zgd->zgd_private;
- objset_t *os = zp->z_zfsvfs->z_os;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- zfs_rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+void
+zfs_zrele_async(znode_t *zp)
{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp),
- dsl_pool_zrele_taskq(dmu_objset_pool(os)));
- return (SET_ERROR(ENOENT));
- }
-
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
- size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- zfs_rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP,
- * so there's no need to zio_flush() its
- * vdevs (flushing would needlesly hurt
- * performance, and doesn't work on
- * indirect vdevs).
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = ITOZSB(vp)->z_os;
- return (error);
+ VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
}
static int
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 3b1658524..75f33fad7 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -392,12 +392,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}
-/*
- * Drop a reference on the passed inode asynchronously. This ensures
- * that the caller will never drop the last reference on an inode in
- * the current context. Doing so while holding open a tx could result
- * in a deadlock if iput_final() re-enters the filesystem code.
- */
void
zfs_zrele_async(znode_t *zp)
{
@@ -421,172 +415,6 @@ zfs_zrele_async(znode_t *zp)
}
}
-/* ARGSUSED */
-static void
-zfs_get_done(zgd_t *zgd, int error)
-{
- znode_t *zp = zgd->zgd_private;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- zfs_rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_zrele_async(zp);
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
- struct lwb *lwb, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
- uint64_t zp_gen;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_zrele_async(zp);
- return (SET_ERROR(ENOENT));
- }
- /* check if generation number matches */
- if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
- sizeof (zp_gen)) != 0) {
- zfs_zrele_async(zp);
- return (SET_ERROR(EIO));
- }
- if (zp_gen != gen) {
- zfs_zrele_async(zp);
- return (SET_ERROR(ENOENT));
- }
-
- zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- zfs_rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP.
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
-
- return (error);
-}
-
/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held inode reference for it.
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 83e08630b..80d343efa 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -52,6 +52,8 @@
#include <sys/policy.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
static ulong_t zfs_fsync_sync_cnt = 4;
@@ -727,6 +729,163 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
return (error);
}
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+static void zfs_get_done(zgd_t *zgd, int error);
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's
+ * written out and its checksum is being calculated
+ * that no one can change the data. We need to re-check
+ * blocksize after we get the lock in case it's changed!
+ */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ zfs_rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);