From cbe882298e4ddc3917dfaf239eca475fe06d62d4 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 8 Feb 2024 10:19:52 -0700 Subject: Add slow disk diagnosis to ZED Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Co-authored-by: Rob Wing Signed-off-by: Don Brady Closes #15469 --- module/zfs/zfs_fm.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'module/zfs/zfs_fm.c') diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c4eb74e87..481af2ba8 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; default: propval = propdef; break; @@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, NULL); } + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; -- cgit v1.2.3