aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Brian Behlendorf <[email protected]> 2023-05-19 13:05:09 -0700
committer: GitHub <[email protected]> 2023-05-19 13:05:09 -0700
commit: 577e835f30c9b92ed8126eb4e8fb17cb0e411c04 (patch)
tree: 8f9c9d765f6f3a5636cf692999bccca9583daeff
parent: 482eeef804f0f325faddb102f112c0f1ec86a1b6 (diff)
Probe vdevs before marking removed
Before allowing the ZED to mark a vdev as REMOVED due to a hotplug event, confirm that it is non-responsive with a probe. Any device which can be successfully probed should be left ONLINE to prevent a healthy pool from being incorrectly SUSPENDED. This may occur for at least the following two scenarios:

1) Drive expansion (zpool online -e) in VMware environments. If, during the partition resize operation, a partition is removed and re-created, then udev will send a removed event.

2) Re-scanning the namespaces of an NVMe device (nvme ns-rescan) may result in a udev remove and add event being delivered.

Finally, update the ZED to only kick in a spare when the removal was successful.

Reviewed-by: Ameer Hamza <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Richard Yao <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #14859
Closes #14861
-rw-r--r-- cmd/zed/agents/zfs_retire.c | 8
-rw-r--r-- module/zfs/vdev.c | 11
2 files changed, 14 insertions, 5 deletions
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 28714ec29..f83ae0925 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -445,14 +445,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
return;
/* Remove the vdev since device is unplugged */
+ int remove_status = 0;
if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
- int status = zpool_vdev_remove_wanted(zhp, devname);
+ remove_status = zpool_vdev_remove_wanted(zhp, devname);
fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
- ", ret:%d", devname, status);
+ ", err:%d", devname, libzfs_errno(zhdl));
}
/* Replace the vdev with a spare if its not a l2arc */
- if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+ if (!l2arc && !remove_status &&
+ (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
/* Could not handle with spare */
fmd_hdl_debug(hdl, "no spare for '%s'", devname);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4bfd95861..c243dddb7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4097,11 +4097,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
/*
- * If the vdev is already removed, then don't do anything.
+ * If the vdev is already removed, or expanding which can trigger
+ * repartition add/remove events, then don't do anything.
*/
- if (vd->vdev_removed)
+ if (vd->vdev_removed || vd->vdev_expanding)
return (spa_vdev_state_exit(spa, NULL, 0));
+ /*
+ * Confirm the vdev has been removed, otherwise don't do anything.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_REMOVE);