summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChunwei Chen <[email protected]>2019-08-28 10:42:02 -0700
committerTony Hutter <[email protected]>2019-09-25 11:27:51 -0700
commitc7a4255f128cc493df8383cb9f1ed650191b2081 (patch)
tree225b6c71819d62a50639edbdaf8c68d1dbfa11b0
parent931bef81c8a4bda13e22be770c1dca3721dffc0f (diff)
Fix zil replay panic when TX_REMOVE followed by TX_CREATE
If TX_REMOVE is followed by TX_CREATE on the same object id, we need to make sure the object removal is completely finished before creation. The current implementation relies on dnode_hold_impl with DNODE_MUST_BE_ALLOCATED returning ENOENT. While this check seemed to work fine before, in the current version it does not guarantee that the object removal is completed. We fix this by checking whether a hold with DNODE_MUST_BE_FREE succeeds instead. Also add a test and remove dead code in dnode_hold_impl. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Tom Caputi <[email protected]> Signed-off-by: Chunwei Chen <[email protected]> Closes #7151 Closes #8910 Closes #9123 Closes #9145
-rw-r--r--include/sys/dnode.h7
-rw-r--r--module/zfs/dnode.c49
-rw-r--r--module/zfs/zfs_replay.c8
-rw-r--r--tests/runfiles/linux.run4
-rw-r--r--tests/zfs-tests/tests/functional/slog/Makefile.am3
-rwxr-xr-xtests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh (renamed from tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh)0
-rwxr-xr-xtests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh137
7 files changed, 184 insertions, 24 deletions
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index c60258bbc..e97e40373 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -46,6 +46,7 @@ extern "C" {
*/
#define DNODE_MUST_BE_ALLOCATED 1
#define DNODE_MUST_BE_FREE 2
+#define DNODE_DRY_RUN 4
/*
* dnode_next_offset() flags.
@@ -415,6 +416,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
+int dnode_try_claim(objset_t *os, uint64_t object, int slots);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
@@ -533,11 +535,6 @@ typedef struct dnode_stats {
*/
kstat_named_t dnode_hold_free_overflow;
/*
- * Number of times a dnode_hold(...) was attempted on a dnode
- * which had already been unlinked in an earlier txg.
- */
- kstat_named_t dnode_hold_free_txg;
- /*
* Number of times dnode_free_interior_slots() needed to retry
* acquiring a slot zrl lock due to contention.
*/
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 5fd473303..cc7bc5ec8 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = {
{ "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
{ "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_allocate", KSTAT_DATA_UINT64 },
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
@@ -1255,6 +1254,10 @@ dnode_buf_evict_async(void *dbu)
* as an extra dnode slot by a large dnode, in which case it returns
* ENOENT.
*
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not. tag and dnp should be set to
+ * NULL in this case.
+ *
* errors:
* EINVAL - Invalid object number or flags.
* ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
@@ -1283,6 +1286,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+ IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
/*
* If you are holding the spa config lock as writer, you shouldn't
@@ -1312,8 +1316,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
return (SET_ERROR(EEXIST));
DNODE_VERIFY(dn);
- (void) zfs_refcount_add(&dn->dn_holds, tag);
- *dnp = dn;
+ /* Don't actually hold if dry run, just return 0 */
+ if (!(flag & DNODE_DRY_RUN)) {
+ (void) zfs_refcount_add(&dn->dn_holds, tag);
+ *dnp = dn;
+ }
return (0);
}
@@ -1455,6 +1462,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
return (SET_ERROR(ENOENT));
}
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
DNODE_STAT_BUMP(dnode_hold_alloc_hits);
} else if (flag & DNODE_MUST_BE_FREE) {
@@ -1512,6 +1527,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
return (SET_ERROR(EEXIST));
}
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
DNODE_STAT_BUMP(dnode_hold_free_hits);
} else {
@@ -1519,15 +1542,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
return (SET_ERROR(EINVAL));
}
- if (dn->dn_free_txg) {
- DNODE_STAT_BUMP(dnode_hold_free_txg);
- type = dn->dn_type;
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
- ENOENT : EEXIST));
- }
+ ASSERT0(dn->dn_free_txg);
if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
@@ -1618,6 +1633,16 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
}
}
+/*
+ * Test whether we can create a dnode at the specified location.
+ *
+ * This is a dry run of dnode_hold_impl() with DNODE_MUST_BE_FREE: tag and
+ * dnp are passed as NULL, so no hold is taken and no dnode is returned.
+ *
+ * os     - objset to check
+ * object - object number to check
+ * slots  - number of dnode slots requested for the object
+ *
+ * Returns 0 if the hold would succeed, otherwise the error reported by
+ * dnode_hold_impl().
+ */
+int
+dnode_try_claim(objset_t *os, uint64_t object, int slots)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
+ slots, NULL, NULL));
+}
+
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index 144381769..7dea85bb6 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -337,8 +337,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
xva.xva_vattr.va_nblocks = lr->lr_gen;
xva.xva_vattr.va_fsid = dnodesize;
- error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
- if (error != ENOENT)
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
goto bail;
if (lr->lr_common.lrc_txtype & TX_CI)
@@ -473,8 +473,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
xva.xva_vattr.va_nblocks = lr->lr_gen;
xva.xva_vattr.va_fsid = dnodesize;
- error = dmu_object_info(zfsvfs->z_os, objid, NULL);
- if (error != ENOENT)
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
goto out;
if (lr->lr_common.lrc_txtype & TX_CI)
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 1c368d20c..0e157cf0e 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -824,8 +824,8 @@ tags = ['functional', 'scrub_mirror']
tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
- 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs',
- 'slog_replay_volume']
+ 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
+ 'slog_replay_fs_002', 'slog_replay_volume']
tags = ['functional', 'slog']
[tests/functional/snapshot]
diff --git a/tests/zfs-tests/tests/functional/slog/Makefile.am b/tests/zfs-tests/tests/functional/slog/Makefile.am
index 4548ce63b..33e3a6d3a 100644
--- a/tests/zfs-tests/tests/functional/slog/Makefile.am
+++ b/tests/zfs-tests/tests/functional/slog/Makefile.am
@@ -17,7 +17,8 @@ dist_pkgdata_SCRIPTS = \
slog_013_pos.ksh \
slog_014_pos.ksh \
slog_015_neg.ksh \
- slog_replay_fs.ksh \
+ slog_replay_fs_001.ksh \
+ slog_replay_fs_002.ksh \
slog_replay_volume.ksh
dist_pkgdata_DATA = \
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh
index 3e5bccd2e..3e5bccd2e 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
new file mode 100755
index 000000000..3c3ccdf4a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
@@ -0,0 +1,137 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/tests/functional/slog/slog.kshlib
+
+#
+# DESCRIPTION:
+# Verify slog replays correctly when TX_REMOVEs are followed by
+# TX_CREATEs.
+#
+# STRATEGY:
+# 1. Create a file system (TESTFS) with a lot of files
+# 2. Freeze TESTFS
+# 3. Remove all files then create a lot of files
+# 4. Copy TESTFS to temporary location (TESTDIR/copy)
+# 5. Unmount filesystem
+# <at this stage TESTFS is empty again and frozen, and the
+# intent log contains a complete set of deltas to replay it>
+# 6. Remount TESTFS <which replays the intent log>
+# 7. Compare TESTFS against the TESTDIR/copy
+#
+
+verify_runnable "global"
+
+function cleanup_fs
+{
+ cleanup
+}
+
+log_assert "Replay of intent log succeeds."
+log_onexit cleanup_fs
+log_must setup
+
+#
+# 1. Create a file system (TESTFS) with a lot of files
+#
+log_must zpool create $TESTPOOL $VDEV log mirror $LDEV
+log_must zfs set compression=on $TESTPOOL
+log_must zfs create $TESTPOOL/$TESTFS
+
+# Prep for the test of TX_REMOVE followed by TX_CREATE
+dnsize=(legacy auto 1k 2k 4k 8k 16k)
+NFILES=200
+log_must mkdir /$TESTPOOL/$TESTFS/dir0
+log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
+
+#
+# Reimport to reset dnode allocation pointer.
+# This is to make sure we will have TX_REMOVE and TX_CREATE on same id
+#
+log_must zpool export $TESTPOOL
+log_must zpool import -f -d $VDIR $TESTPOOL
+
+#
+# This dd command works around an issue where ZIL records aren't created
+# after freezing the pool unless a ZIL header already exists. Create a file
+# synchronously to force ZFS to write one out.
+#
+log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \
+ conv=fdatasync,fsync bs=1 count=1
+
+#
+# 2. Freeze TESTFS
+#
+log_must zpool freeze $TESTPOOL
+
+#
+# 3. Remove all files then create a lot of files
+#
+# TX_REMOVE followed by TX_CREATE
+log_must eval 'rm -f /$TESTPOOL/$TESTFS/dir0/*'
+log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
+
+#
+# 4. Copy TESTFS to temporary location (TESTDIR/copy)
+#
+log_must mkdir -p $TESTDIR/copy
+log_must cp -a /$TESTPOOL/$TESTFS/* $TESTDIR/copy/
+
+#
+# 5. Unmount filesystem and export the pool
+#
+# At this stage TESTFS is empty again and frozen, the intent log contains
+# a complete set of deltas to replay.
+#
+log_must zfs unmount /$TESTPOOL/$TESTFS
+
+log_note "Verify transactions to replay:"
+log_must zdb -iv $TESTPOOL/$TESTFS
+
+log_must zpool export $TESTPOOL
+
+#
+# 6. Remount TESTFS <which replays the intent log>
+#
+# Import the pool to unfreeze it and claim log blocks. It has to be
+# `zpool import -f` because we can't write a frozen pool's labels!
+#
+log_must zpool import -f -d $VDIR $TESTPOOL
+
+#
+# 7. Compare TESTFS against the TESTDIR/copy
+#
+log_note "Verify current block usage:"
+log_must zdb -bcv $TESTPOOL
+
+log_note "Verify number of files"
+log_must test "$(ls /$TESTPOOL/$TESTFS/dir0 | wc -l)" -eq $NFILES
+
+log_note "Verify working set diff:"
+log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy
+
+log_pass "Replay of intent log succeeds."