From 361a7e821178e105c8e1206ead4479de83c2a617 Mon Sep 17 00:00:00 2001 From: Jitendra Patidar <53164267+jsai20@users.noreply.github.com> Date: Wed, 23 Feb 2022 02:36:43 +0530 Subject: log xattr=sa create/remove/update to ZIL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As such, there are no specific synchronous semantics defined for the xattrs. But for xattr=on, it does log to ZIL and zil_commit() is done, if sync=always is set on dataset. This provides sync semantics for xattr=on with sync=always set on dataset. For the xattr=sa implementation, it doesn't log to ZIL, so, even with sync=always, xattrs are not guaranteed to be synced before xattr call returns to caller. So, xattr can be lost if system crash happens, before txg carrying xattr transaction is synced. This change adds xattr=sa logging to ZIL on xattr create/remove/update and xattrs are synced to ZIL (zil_commit() done) for sync=always. This makes xattr=sa behavior similar to xattr=on. Implementation notes: The actual logging is fairly straight-forward and does not warrant additional explanation. However, it has been 14 years since we last added new TX types to the ZIL [1], hence this is the first time we do it after the introduction of zpool features. Therefore, here is an overview of the feature activation and deactivation workflow: 1. The feature must be enabled. Otherwise, we don't log the new record type. This ensures compatibility with older software. 2. The feature is activated per-dataset, since the ZIL is per-dataset. 3. If the feature is enabled and dataset is not for zvol, any append to the ZIL chain will activate the feature for the dataset. Likewise for starting a new ZIL chain. 4. A dataset that doesn't have a ZIL chain has the feature deactivated. We ensure (3) by activating on the first zil_commit() after the feature was enabled. Since activating the features requires waiting for txg sync, the first zil_commit() after enabling the feature will be slower than usual. The downside is that this is really a conservative approximation: even if we never append a 'TX_SETSAXATTR' to the ZIL chain, we pay the penalty for feature activation. The upside is that the user is in control of when we pay the penalty, i.e., upon enabling the feature. We ensure (4) by hooking into zil_sync(), where ZIL destroy actually happens. One more piece on feature activation, since it's spread across multiple functions: zil_commit() zil_process_commit_list() if lwb == NULL // first zil_commit since zil_open zil_create() if no log block pointer in ZIL header: if feature enabled and not active: // CASE 1 enable, COALESCE txg wait with dmu_tx that allocated the log block else // log block was allocated earlier than this zil_open if feature enabled and not active: // CASE 2 enable, EXPLICIT txg wait else // already have an in-DRAM LWB if feature enabled and not active: // this happens when we enable the feature after zil_create // CASE 3 enable, EXPLICIT txg wait [1] https://github.com/illumos/illumos-gate/commit/da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0 Reviewed-by: Matthew Ahrens Reviewed-by: Christian Schwarz Reviewed-by: Ahelenia ZiemiaƄska Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Jitendra Patidar Closes #8768 Closes #9078 --- module/os/freebsd/zfs/zfs_vnops_os.c | 6 +-- module/os/linux/zfs/zpl_xattr.c | 2 +- module/zcommon/zfeature_common.c | 12 ++++++ module/zfs/zfs_log.c | 34 +++++++++++++++ module/zfs/zfs_replay.c | 83 ++++++++++++++++++++++++++++++++++++ module/zfs/zfs_sa.c | 29 ++++++++++++- module/zfs/zil.c | 74 ++++++++++++++++++++++++++++++++ 7 files changed, 234 insertions(+), 6 deletions(-) (limited to 'module') diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 1b0f53c6d..21d121a15 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -5549,7 +5549,7 @@ zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname) if (error != 0) error = SET_ERROR(error); else - error = zfs_sa_set_xattr(zp); + error = zfs_sa_set_xattr(zp, attrname, NULL, 0); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); @@ -5706,9 +5706,9 @@ zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname) if (error != 0) error = SET_ERROR(error); } - kmem_free(buf, entry_size); if (error == 0) - error = zfs_sa_set_xattr(zp); + error = zfs_sa_set_xattr(zp, attrname, buf, entry_size); + kmem_free(buf, entry_size); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index ce1815771..3b8ac517a 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -578,7 +578,7 @@ zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, * will be reconstructed from the ARC when next accessed. */ if (error == 0) - error = -zfs_sa_set_xattr(zp); + error = -zfs_sa_set_xattr(zp, name, value, size); if (error) { nvlist_free(nvl); diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 529c52316..13dbccae2 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -695,6 +695,18 @@ zpool_feature_init(void) "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t zilsaxattr_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZILSAXATTR, + "org.openzfs:zilsaxattr", "zilsaxattr", + "Support for xattr=sa extended attribute logging in ZIL.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index babf0406e..9df801870 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -720,6 +720,40 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, zil_itx_assign(zilog, itx, tx); } +/* + * Handles TX_SETSAXATTR transactions. + */ +void +zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, const char *name, const void *value, size_t size) +{ + itx_t *itx; + lr_setsaxattr_t *lr; + size_t recsize = sizeof (lr_setsaxattr_t); + void *xattrstart; + int namelen; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + namelen = strlen(name) + 1; + recsize += (namelen + size); + itx = zil_itx_create(txtype, recsize); + lr = (lr_setsaxattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + xattrstart = (char *)(lr + 1); + bcopy(name, xattrstart, namelen); + if (value != NULL) { + bcopy(value, (char *)xattrstart + namelen, size); + lr->lr_size = size; + } else { + lr->lr_size = 0; + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + /* * Handles TX_ACL transactions. */ diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 860ca5929..3ccd96dc2 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include /* * NB: FreeBSD expects to be able to do vnode locking in lookup and @@ -868,6 +870,86 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) return (error); } +static int +zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_setsaxattr_t *lr = arg2; + znode_t *zp; + nvlist_t *nvl; + size_t sa_size; + char *name; + char *value; + size_t size; + int error = 0; + + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR)); + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + goto out; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + /* Get xattr name, value and size from log record */ + size = lr->lr_size; + name = (char *)(lr + 1); + if (size == 0) { + value = NULL; + error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + } else { + value = name + strlen(name) + 1; + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) { + error = SET_ERROR(EFBIG); + goto out; + } + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + goto out; + + if (sa_size > DXATTR_MAX_SA_SIZE) { + error = SET_ERROR(EFBIG); + goto out; + } + + error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, + size); + } + + /* + * Update the SA for additions, modifications, and removals. On + * error drop the inconsistent cached version of the nvlist, it + * will be reconstructed from the ARC when next accessed. + */ + if (error == 0) + error = zfs_sa_set_xattr(zp, name, value, size); + + if (error) { + nvlist_free(nvl); + zp->z_xattr_cached = NULL; + } + +out: + rw_exit(&zp->z_xattr_lock); + zrele(zp); + return (error); +} + static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { @@ -989,4 +1071,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ + zfs_replay_setsaxattr, /* TX_SETSAXATTR */ }; diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 817f63048..1f15cae00 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * ZPL attribute registration table. @@ -69,7 +70,10 @@ const sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {NULL, 0, 0, 0} }; + #ifdef _KERNEL +static int zfs_zil_saxattr = 1; + int zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio) { @@ -219,13 +223,14 @@ zfs_sa_get_xattr(znode_t *zp) } int -zfs_sa_set_xattr(znode_t *zp) +zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize) { zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog; dmu_tx_t *tx; char *obj; size_t size; - int error; + int error, logsaxattr = 0; ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT(zp->z_xattr_cached); @@ -244,6 +249,17 @@ zfs_sa_set_xattr(znode_t *zp) if (error) goto out_free; + zilog = zfsvfs->z_log; + + /* + * Users enable ZIL logging of xattr=sa operations by enabling the + * SPA_FEATURE_ZILSAXATTR feature on the pool. Feature is activated + * during zil_process_commit_list/zil_create, if enabled. + */ + if (spa_feature_is_enabled(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR) && zfs_zil_saxattr) + logsaxattr = 1; + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, size); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); @@ -256,6 +272,10 @@ zfs_sa_set_xattr(znode_t *zp) sa_bulk_attr_t bulk[2]; uint64_t ctime[2]; + if (logsaxattr) + zfs_log_setsaxattr(zilog, tx, TX_SETSAXATTR, zp, name, + value, vsize); + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs), NULL, obj, size); @@ -264,6 +284,8 @@ zfs_sa_set_xattr(znode_t *zp) VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); + if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); } out_free: vmem_free(obj, size); @@ -433,6 +455,9 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) } } +ZFS_MODULE_PARAM(zfs, zfs_, zil_saxattr, INT, ZMOD_RW, + "Disable xattr=sa extended attribute logging in ZIL by settng 0."); + EXPORT_SYMBOL(zfs_attr_table); EXPORT_SYMBOL(zfs_sa_readlink); EXPORT_SYMBOL(zfs_sa_symlink); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 87a50a5c4..10f89c916 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -663,6 +663,38 @@ zilog_is_dirty(zilog_t *zilog) return (B_FALSE); } +/* + * Its called in zil_commit context (zil_process_commit_list()/zil_create()). + * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. + * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every + * zil_commit. + */ +static void +zil_commit_activate_saxattr_feature(zilog_t *zilog) +{ + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + + if (spa_feature_is_enabled(zilog->zl_spa, + SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && + !dsl_dataset_feature_is_active(ds, + SPA_FEATURE_ZILSAXATTR)) { + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(ds, tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } +} + /* * Create an on-disk intent log. */ @@ -677,6 +709,8 @@ zil_create(zilog_t *zilog) int error = 0; boolean_t fastwrite = FALSE; boolean_t slog = FALSE; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* * Wait for any previous destroy to complete. @@ -724,9 +758,33 @@ zil_create(zilog_t *zilog) * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { + /* + * If "zilsaxattr" feature is enabled on zpool, then activate + * it now when we're creating the ZIL chain. We can't wait with + * this until we write the first xattr log record because we + * need to wait for the feature activation to sync out. + */ + if (spa_feature_is_enabled(zilog->zl_spa, + SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != + DMU_OST_ZVOL) { + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); + } else { + /* + * This branch covers the case where we enable the feature on a + * zpool that has existing ZIL headers. + */ + zil_commit_activate_saxattr_feature(zilog); } + IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, + dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); @@ -2297,6 +2355,11 @@ zil_process_commit_list(zilog_t *zilog) if (lwb == NULL) { lwb = zil_create(zilog); } else { + /* + * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will + * have already been created (zl_lwb_list not empty). + */ + zil_commit_activate_saxattr_feature(zilog); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); @@ -3075,6 +3138,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_head(&zilog->zl_lwb_list) == NULL); @@ -3092,6 +3156,16 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; + } else { + /* + * A destroyed ZIL chain can't contain any TX_SETSAXATTR + * records. So, deactivate the feature for this dataset. + * We activate it again when we start a new ZIL chain. + */ + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_ZILSAXATTR)) + dsl_dataset_deactivate_feature(ds, + SPA_FEATURE_ZILSAXATTR, tx); } } -- cgit v1.2.3