author     Brian Behlendorf <[email protected]>  2020-08-18 22:13:17 -0700
committer  GitHub <[email protected]>            2020-08-18 22:13:17 -0700
commit     5266a0728aae503fb2f79961299d4dffac58b22a (patch)
tree       d37cedb4ccf2b444d63b497ae1a3a6ffeccdb02c
parent     cfd59f904b0760f1bc909bc1b6deae9798042af9 (diff)
ZED: Do not offline a missing device if no spare is available
Due to commit d48091d a removed device is now explicitly offlined by the ZED
if no spare is available, rather than letting ZFS detect it as UNAVAIL. This
broke auto-replacing of whole-disk devices, as described in issue #10577.
In short, when a new device is reinserted in the same slot, the ZED will try
to ONLINE it without letting ZFS recreate the necessary partition table.

This change simply avoids setting the device OFFLINE when removed if no
spare is available (or if spare_on_remove is false). The change has been
left minimal to allow it to be backported to the 0.8.x release.

The auto_offline_001_pos ZTS test has been updated accordingly.

Some follow-up work is planned to update the ZED so it transitions the vdev
to a REMOVED state. This is a state which has always existed, but there is
no current interface the ZED can use to accomplish this. Therefore it's
being left to a follow-up PR.

Reviewed-by: Gionatan Danti <[email protected]>
Co-authored-by: Gionatan Danti <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10577
Closes #10730
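For orientation before reading the diff, the following is a small,
self-contained C sketch of the decision flow that results from this change.
The struct, field names, and main() harness are illustrative stand-ins only:
the real logic lives in zfs_retire_recv() in cmd/zed/agents/zfs_retire.c and
uses libzfs/FMD interfaces, and the condition guarding the first branch is
not shown in the hunk below.

/*
 * Minimal standalone sketch of the post-patch removal handling in the ZED
 * retire agent.  Illustrative only; not the upstream source.
 */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the agent's inputs. */
struct removal_event {
	const char *devname;	/* device that disappeared */
	bool offline_requested;	/* unchanged preceding branch still offlines */
	bool spare_on_remove;	/* zed.rc: use a hot spare on removal */
	bool spare_available;	/* replace_with_spare() would succeed */
};

static void
handle_removal(const struct removal_event *ev)
{
	if (ev->offline_requested) {
		/* Unchanged path shown as context at the top of the hunk. */
		printf("zpool_vdev_offline '%s'\n", ev->devname);
	} else if (!ev->spare_on_remove || !ev->spare_available) {
		/*
		 * Post-patch behavior: with no usable spare the agent only
		 * logs, leaving ZFS to mark the vdev UNAVAIL so a reinserted
		 * whole disk can be auto-replaced.
		 */
		printf("no spare for '%s'\n", ev->devname);
	} else {
		/* replace_with_spare() activated a hot spare. */
		printf("'%s' replaced by hot spare\n", ev->devname);
	}
}

int
main(void)
{
	struct removal_event ev = {
		.devname = "sdb",
		.offline_requested = false,
		.spare_on_remove = true,
		.spare_available = false,
	};
	handle_removal(&ev);	/* prints: no spare for 'sdb' */
	return (0);
}

The key point modeled here is the middle branch: the device is no longer set
OFFLINE when it cannot be handled by a spare.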
-rw-r--r--   cmd/zed/agents/zfs_retire.c                                       5
-rwxr-xr-x   tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh  58
2 files changed, 39 insertions(+), 24 deletions(-)
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 665fb216d..9e95e20d5 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -351,9 +351,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
zpool_vdev_offline(zhp, devname, B_TRUE);
} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
- /* Could not handle with spare: offline the device */
- fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
- zpool_vdev_offline(zhp, devname, B_TRUE);
+ /* Could not handle with spare */
+ fmd_hdl_debug(hdl, "no spare for '%s'", devname);
}
free(devname);
diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
index 42c885cbf..1bf54b1a8 100755
--- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
@@ -25,23 +25,29 @@
#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Physically removed device is
-# offlined and onlined when reattached
+# made unavailable and onlined when reattached
#
# STRATEGY:
# 1. Create a pool
# 2. Simulate physical removal of one device
-# 3. Verify the device is offlined
+# 3. Verify the device is unavailable
# 4. Reattach the device
# 5. Verify the device is onlined
-# 6. Repeat the same tests with a spare device: zed will use the spare to handle
-# the removed data device
-# 7. Repeat the same tests again with a faulted spare device: zed should offline
-# the removed data device if no spare is available
+# 6. Repeat the same tests with a spare device:
+# zed will use the spare to handle the removed data device
+# 7. Repeat the same tests again with a faulted spare device:
+# the removed data device should be unavailable
#
# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
# conditions caused by mixing creation/removal events from partitioning the
# disk (zpool create) and events from physically removing it (remove_disk).
#
+# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
+# vdev to the unavailable state. The ZED does receive a removal notification
+# but only relies on it to activate a hot spare. Additional work is planned
+# to extend an existing ioctl interface to allow the ZED to transition the
+# vdev into a removed state.
+#
verify_runnable "both"
if is_linux; then
@@ -76,7 +82,6 @@ removedev=$(get_debug_device)
typeset poolconfs=(
"mirror $filedev1 $removedev"
"raidz3 $filedev1 $filedev2 $filedev3 $removedev"
- "$filedev1 cache $removedev"
"mirror $filedev1 $filedev2 special mirror $filedev3 $removedev"
)
@@ -91,11 +96,16 @@ do
log_must zpool create -f $TESTPOOL $conf
block_device_wait ${DEV_DSKDIR}/${removedev}
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+ log_fail "get_prop mountpoint /$TESTPOOL"
+
# 2. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 3. Verify the device is offlined
- log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+ # 3. Verify the device is unavailable.
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
# 4. Reattach the device
insert_disk $removedev
@@ -118,21 +128,22 @@ do
block_device_wait ${DEV_DSKDIR}/${removedev}
log_must zpool add $TESTPOOL spare $sparedev
- # 3. Simulate physical removal of one device
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+ log_fail "get_prop mountpoint /$TESTPOOL"
+
+ # 2. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 4. Verify the device is handled by the spare unless is a l2arc disk
- # which can only be offlined
- if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then
- log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
- else
- log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
- fi
+ # 3. Verify the device is handled by the spare.
+ log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
- # 5. Reattach the device
+ # 4. Reattach the device
insert_disk $removedev
- # 6. Verify the device is onlined
+ # 5. Verify the device is onlined
log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
# cleanup
@@ -150,15 +161,20 @@ do
block_device_wait ${DEV_DSKDIR}/${removedev}
log_must zpool add $TESTPOOL spare $sparedev
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+ log_fail "get_prop mountpoint /$TESTPOOL"
+
# 2. Fault the spare device making it unavailable
log_must zpool offline -f $TESTPOOL $sparedev
log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
# 3. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 4. Verify the device is offlined
- log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+ # 4. Verify the device is unavailable
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
# 5. Reattach the device
insert_disk $removedev