diff options
author | Brian Behlendorf <[email protected]> | 2023-05-19 13:05:09 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2023-05-19 13:05:09 -0700 |
commit | 577e835f30c9b92ed8126eb4e8fb17cb0e411c04 (patch) | |
tree | 8f9c9d765f6f3a5636cf692999bccca9583daeff | |
parent | 482eeef804f0f325faddb102f112c0f1ec86a1b6 (diff) |
Probe vdevs before marking removed
Before allowing the ZED to mark a vdev as REMOVED due to a
hotplug event, confirm that it is non-responsive with a probe.
Any device which can be successfully probed should be left
ONLINE to prevent a healthy pool from being incorrectly
SUSPENDED. This may occur in at least the following two
scenarios:
1) Drive expansion (zpool online -e) in VMware environments.
If, during the partition resize operation, a partition is
removed and re-created, then udev will send a remove event.
2) Re-scanning the namespaces of an NVMe device (nvme ns-rescan)
may result in udev remove and add events being delivered.
Finally, update the ZED to only kick in a spare when the
removal was successful.
Reviewed-by: Ameer Hamza <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Richard Yao <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #14859
Closes #14861
-rw-r--r-- | cmd/zed/agents/zfs_retire.c | 8 | ||||
-rw-r--r-- | module/zfs/vdev.c | 11 |
2 files changed, 14 insertions, 5 deletions
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 28714ec29..f83ae0925 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -445,14 +445,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 		return;
 
 	/* Remove the vdev since device is unplugged */
+	int remove_status = 0;
 	if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
-		int status = zpool_vdev_remove_wanted(zhp, devname);
+		remove_status = zpool_vdev_remove_wanted(zhp, devname);
 		fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
-		    ", ret:%d", devname, status);
+		    ", err:%d", devname, libzfs_errno(zhdl));
 	}
 
 	/* Replace the vdev with a spare if its not a l2arc */
-	if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+	if (!l2arc && !remove_status &&
+	    (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
 	    replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
 		/* Could not handle with spare */
 		fmd_hdl_debug(hdl, "no spare for '%s'", devname);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4bfd95861..c243dddb7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4097,11 +4097,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/*
-	 * If the vdev is already removed, then don't do anything.
+	 * If the vdev is already removed, or expanding which can trigger
+	 * repartition add/remove events, then don't do anything.
 	 */
-	if (vd->vdev_removed)
+	if (vd->vdev_removed || vd->vdev_expanding)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
+	/*
+	 * Confirm the vdev has been removed, otherwise don't do anything.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
 	vd->vdev_remove_wanted = B_TRUE;
 	spa_async_request(spa, SPA_ASYNC_REMOVE);
 