author    | Brian Behlendorf <[email protected]> | 2020-08-18 22:13:17 -0700
committer | GitHub <[email protected]> | 2020-08-18 22:13:17 -0700
commit    | 5266a0728aae503fb2f79961299d4dffac58b22a
tree      | d37cedb4ccf2b444d63b497ae1a3a6ffeccdb02c
parent    | cfd59f904b0760f1bc909bc1b6deae9798042af9
ZED: Do not offline a missing device if no spare is available
Due to commit d48091d, a removed device is now explicitly offlined by
the ZED if no spare is available, rather than letting ZFS detect
it as UNAVAIL. This broke auto-replacing of whole-disk devices, as
described in issue #10577. In short, when a new device is reinserted
in the same slot, the ZED will try to ONLINE it without letting ZFS
recreate the necessary partition table.
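For context, the broken scenario can be sketched roughly as follows on Linux. This is an illustrative, hypothetical sequence, not part of the commit: the pool name (tank), disk names (sda/sdb), and the SCSI host number are placeholders, and it assumes whole-disk vdevs with the ZED running.

```sh
# Hypothetical sketch of the broken auto-replace scenario (names are placeholders).
# Assumes whole-disk vdevs, a running ZED, and Linux SCSI disks.
zpool create -o autoreplace=on tank mirror sda sdb

# Simulate physically pulling sdb via the standard SCSI hot-removal interface.
echo 1 > /sys/block/sdb/device/delete

# Reinsert a disk in the same slot by rescanning the SCSI host.
echo "- - -" > /sys/class/scsi_host/host0/scan

# With the pre-fix behavior the ZED had already set the vdev OFFLINE and now
# tries to ONLINE it directly, skipping the repartitioning that a whole-disk
# auto-replace requires.
zpool status tank
```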
This change simply avoids setting the device OFFLINE when removed if
no spare is available (or if spare_on_remove is false). The change
has been kept minimal so it can be backported to the 0.8.x release.
The auto_offline_001_pos ZTS test has been updated accordingly.
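Roughly speaking, the updated test now expects the following observable behavior when no usable spare exists. This is a minimal sketch with placeholder pool and path names; the authoritative version is the auto_offline_001_pos.ksh diff below.

```sh
# After the removal, write to the pool and sync so the missing vdev is exercised;
# ZFS itself then reports it UNAVAIL instead of the ZED marking it OFFLINE.
dd if=/dev/zero of=/tank/file bs=1M count=1
zpool sync tank
zpool status tank    # the removed vdev is expected to show UNAVAIL
```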
Some follow-up work is planned to update the ZED so it transitions
the vdev to a REMOVED state. This state has always existed, but
there is currently no interface the ZED can use to set it, so that
work is being left to a follow-up PR.
Reviewed-by: Gionatan Danti <[email protected]>
Co-authored-by: Gionatan Danti <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10577
Closes #10730
-rw-r--r-- | cmd/zed/agents/zfs_retire.c | 5
-rwxr-xr-x | tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh | 58
2 files changed, 39 insertions, 24 deletions
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 665fb216d..9e95e20d5 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -351,9 +351,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 			zpool_vdev_offline(zhp, devname, B_TRUE);
 		} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
 		    replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
-			/* Could not handle with spare: offline the device */
-			fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
-			zpool_vdev_offline(zhp, devname, B_TRUE);
+			/* Could not handle with spare */
+			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
 		}
 
 		free(devname);
diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
index 42c885cbf..1bf54b1a8 100755
--- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
@@ -25,23 +25,29 @@
 #
 # DESCRIPTION:
 # Testing Fault Management Agent ZED Logic - Physically removed device is
-# offlined and onlined when reattached
+# made unavail and onlined when reattached
 #
 # STRATEGY:
 # 1. Create a pool
 # 2. Simulate physical removal of one device
-# 3. Verify the device is offlined
+# 3. Verify the device is unvailable
 # 4. Reattach the device
 # 5. Verify the device is onlined
-# 6. Repeat the same tests with a spare device: zed will use the spare to handle
-#    the removed data device
-# 7. Repeat the same tests again with a faulted spare device: zed should offline
-#    the removed data device if no spare is available
+# 6. Repeat the same tests with a spare device:
+#    zed will use the spare to handle the removed data device
+# 7. Repeat the same tests again with a faulted spare device:
+#    the removed data device should be unavailable
 #
 # NOTE: the use of 'block_device_wait' throughout the test helps avoid race
 # conditions caused by mixing creation/removal events from partitioning the
 # disk (zpool create) and events from physically removing it (remove_disk).
 #
+# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
+# vdev to the unavailable state. The ZED does receive a removal notification
+# but only relies on it to activate a hot spare. Additional work is planned
+# to extend an existing ioctl interface to allow the ZED to transition the
+# vdev in to a removed state.
+#
 verify_runnable "both"
 
 if is_linux; then
@@ -76,7 +82,6 @@ removedev=$(get_debug_device)
 
 typeset poolconfs=(
 	"mirror $filedev1 $removedev"
 	"raidz3 $filedev1 $filedev2 $filedev3 $removedev"
-	"$filedev1 cache $removedev"
 	"mirror $filedev1 $filedev2 special mirror $filedev3 $removedev"
 )
@@ -91,11 +96,16 @@ do
 	log_must zpool create -f $TESTPOOL $conf
 	block_device_wait ${DEV_DSKDIR}/${removedev}
 
+	mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+	    log_fail "get_prop mountpoint /$TESTPOOL"
+
 	# 2. Simulate physical removal of one device
 	remove_disk $removedev
+	log_must mkfile 1m $mntpnt/file
+	log_must zpool sync $TESTPOOL
 
-	# 3. Verify the device is offlined
-	log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+	# 3. Verify the device is unvailable.
+	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
 
 	# 4. Reattach the device
 	insert_disk $removedev
@@ -118,21 +128,22 @@ do
 	block_device_wait ${DEV_DSKDIR}/${removedev}
 	log_must zpool add $TESTPOOL spare $sparedev
 
-	# 3. Simulate physical removal of one device
+	mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+	    log_fail "get_prop mountpoint /$TESTPOOL"
+
+	# 2. Simulate physical removal of one device
 	remove_disk $removedev
+	log_must mkfile 1m $mntpnt/file
+	log_must zpool sync $TESTPOOL
 
-	# 4. Verify the device is handled by the spare unless is a l2arc disk
-	# which can only be offlined
-	if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then
-		log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
-	else
-		log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
-	fi
+	# 3. Verify the device is handled by the spare.
+	log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
+	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
 
-	# 5. Reattach the device
+	# 4. Reattach the device
 	insert_disk $removedev
 
-	# 6. Verify the device is onlined
+	# 5. Verify the device is onlined
 	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
 
 	# cleanup
@@ -150,15 +161,20 @@ do
 	block_device_wait ${DEV_DSKDIR}/${removedev}
 	log_must zpool add $TESTPOOL spare $sparedev
 
+	mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+	    log_fail "get_prop mountpoint /$TESTPOOL"
+
 	# 2. Fault the spare device making it unavailable
 	log_must zpool offline -f $TESTPOOL $sparedev
 	log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
 
 	# 3. Simulate physical removal of one device
 	remove_disk $removedev
+	log_must mkfile 1m $mntpnt/file
+	log_must zpool sync $TESTPOOL
 
-	# 4. Verify the device is offlined
-	log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+	# 4. Verify the device is unavailable
+	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
 
 	# 5. Reattach the device
 	insert_disk $removedev