aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDon Brady <[email protected]>2024-02-08 10:19:52 -0700
committerGitHub <[email protected]>2024-02-08 09:19:52 -0800
commitcbe882298e4ddc3917dfaf239eca475fe06d62d4 (patch)
treeff089153d8a4180df1dca1e7e47fd23584155f77
parent229b9f4ed05e6d14fb4d73fa04a71e99b01bb534 (diff)
Add slow disk diagnosis to ZED
Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter <[email protected]> Reviewed-by: Allan Jude <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Co-authored-by: Rob Wing <[email protected]> Signed-off-by: Don Brady <[email protected]> Closes #15469
-rw-r--r--cmd/zed/agents/fmd_api.c57
-rw-r--r--cmd/zed/agents/fmd_api.h3
-rw-r--r--cmd/zed/agents/fmd_serd.c3
-rw-r--r--cmd/zed/agents/fmd_serd.h2
-rw-r--r--cmd/zed/agents/zfs_diagnosis.c143
-rw-r--r--cmd/zed/agents/zfs_retire.c3
-rw-r--r--cmd/zinject/zinject.c16
-rw-r--r--cmd/zpool/zpool_main.c8
-rw-r--r--include/sys/fm/fs/zfs.h2
-rw-r--r--include/sys/fs/zfs.h2
-rw-r--r--include/sys/vdev_impl.h5
-rw-r--r--lib/libzfs/libzfs.abi4
-rw-r--r--lib/libzfs/libzfs_pool.c2
-rw-r--r--lib/libzfs/libzfs_util.c4
-rw-r--r--man/man7/vdevprops.74
-rw-r--r--man/man7/zpoolconcepts.74
-rw-r--r--man/man8/zinject.81
-rw-r--r--module/zcommon/zpool_prop.c6
-rw-r--r--module/zfs/vdev.c30
-rw-r--r--module/zfs/zfs_fm.c26
-rw-r--r--module/zfs/zio_inject.c4
-rw-r--r--tests/runfiles/linux.run3
-rw-r--r--tests/zfs-tests/tests/Makefile.am2
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg2
-rwxr-xr-xtests/zfs-tests/tests/functional/events/cleanup.ksh4
-rwxr-xr-xtests/zfs-tests/tests/functional/events/zed_slow_io.ksh205
-rwxr-xr-xtests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh177
-rwxr-xr-xtests/zfs-tests/tests/functional/fault/cleanup.ksh1
-rwxr-xr-xtests/zfs-tests/tests/functional/fault/setup.ksh1
29 files changed, 654 insertions, 70 deletions
diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c
index 4a6cfbf8c..fe43e2ab9 100644
--- a/cmd/zed/agents/fmd_api.c
+++ b/cmd/zed/agents/fmd_api.c
@@ -22,6 +22,7 @@
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
if (strcmp(name, "spare_on_remove") == 0)
return (1);
- if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
- return (10); /* N = 10 events */
-
- return (0);
-}
-
-int64_t
-fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
-{
- (void) hdl;
-
- /*
- * These can be looked up in mp->modinfo->fmdi_props
- * For now we just hard code for phase 2. In the
- * future, there can be a ZED based override.
- */
- if (strcmp(name, "remove_timeout") == 0)
- return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
-
- if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
- return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
-
return (0);
}
@@ -535,20 +514,31 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
}
-void
-fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+int
+fmd_serd_active(fmd_hdl_t *hdl, const char *name)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
- return;
+ return (0);
}
+ return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
+}
- fmd_serd_eng_reset(sgp);
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
- fmd_hdl_debug(hdl, "serd_reset %s", name);
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+ } else {
+ fmd_serd_eng_reset(sgp);
+ fmd_hdl_debug(hdl, "serd_reset %s", name);
+ }
}
int
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
- int err;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
name);
return (0);
}
- err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+ return (fmd_serd_eng_record(sgp, ep->ev_hrt));
+}
+
+void
+fmd_serd_gc(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
- return (err);
+ fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
}
/* FMD Timers */
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
struct itimerspec its;
- fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+ fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
/* disarm the timer */
memset(&its, 0, sizeof (struct itimerspec));
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
index b940d0d39..8471feecf 100644
--- a/cmd/zed/agents/fmd_api.h
+++ b/cmd/zed/agents/fmd_api.h
@@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
-extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
@@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern int fmd_serd_active(fmd_hdl_t *, const char *);
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+extern void fmd_serd_gc(fmd_hdl_t *);
extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
extern void fmd_timer_remove(fmd_hdl_t *, id_t);
diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c
index 0bb2c535f..f942e62b3 100644
--- a/cmd/zed/agents/fmd_serd.c
+++ b/cmd/zed/agents/fmd_serd.c
@@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
}
void
-fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
{
+ (void) arg;
fmd_serd_elem_t *sep, *nep;
hrtime_t hrt;
diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h
index 25b6888e6..80ff9a3b2 100644
--- a/cmd/zed/agents/fmd_serd.h
+++ b/cmd/zed/agents/fmd_serd.h
@@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
-extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
#ifdef __cplusplus
}
diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
index f6ba334a3..e0ad00800 100644
--- a/cmd/zed/agents/zfs_diagnosis.c
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -23,6 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
*/
#include <stddef.h>
@@ -47,11 +48,16 @@
#define DEFAULT_CHECKSUM_T 600 /* seconds */
#define DEFAULT_IO_N 10 /* events */
#define DEFAULT_IO_T 600 /* seconds */
+#define DEFAULT_SLOW_IO_N 10 /* events */
+#define DEFAULT_SLOW_IO_T 30 /* seconds */
+
+#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */
/*
- * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
- * #define reserves enough space for two 64-bit hex values plus the length of
- * the longest string.
+ * Our serd engines are named in the following format:
+ * 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
+ * This #define reserves enough space for two 64-bit hex values plus the
+ * length of the longest string.
*/
#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
@@ -68,6 +74,7 @@ typedef struct zfs_case_data {
int zc_pool_state;
char zc_serd_checksum[MAX_SERDLEN];
char zc_serd_io[MAX_SERDLEN];
+ char zc_serd_slow_io[MAX_SERDLEN];
int zc_has_remove_timer;
} zfs_case_data_t;
@@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};
-static hrtime_t zfs_remove_timeout;
+/* wait 15 seconds after a removal */
+static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;
@@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
#define ZFS_MAKE_EREPORT(type) \
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+static void zfs_purge_cases(fmd_hdl_t *hdl);
+
/*
* Write out the persistent representation of an active case.
*/
@@ -171,6 +181,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
}
/*
+ * count other unique slow-io cases in a pool
+ */
+static uint_t
+zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
+{
+ zfs_case_t *zcp;
+ uint_t cases = 0;
+ static hrtime_t next_check = 0;
+
+ /*
+ * Note that plumbing in some external GC would require adding locking,
+ * since most of this module code is not thread safe and assumes there
+ * is only one thread running against the module. So we perform GC here
+ * inline periodically so that future delay induced faults will be
+ * possible once the issue causing multiple vdev delays is resolved.
+ */
+ if (gethrestime_sec() > next_check) {
+ /* Periodically purge old SERD entries and stale cases */
+ fmd_serd_gc(hdl);
+ zfs_purge_cases(hdl);
+ next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
+ }
+
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
+ zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
+ zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+ fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
+ cases++;
+ }
+ }
+ return (cases);
+}
+
+/*
* Iterate over any active cases. If any cases are associated with a pool or
* vdev which is no longer present on the system, close the associated case.
*/
@@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
(long long unsigned int)vdev_guid, type);
}
+static void
+zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+ fmd_hdl_debug(hdl, "retiring case");
+
+ fmd_case_close(hdl, zcp->zc_case);
+}
+
/*
* Solve a given ZFS case. This first checks to make sure the diagnosis is
* still valid, as well as cleaning up any pending timer associated with the
@@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (strcmp(class,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
strcmp(class,
- ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
- strcmp(class,
- ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
zfs_stats.resource_drops.fmds_value.ui64++;
return;
}
@@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (zcp->zc_data.zc_serd_checksum[0] != '\0')
fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_checksum);
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+ fmd_serd_reset(hdl,
+ zcp->zc_data.zc_serd_slow_io);
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
uint64_t state = 0;
@@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_case_solved(hdl, zcp->zc_case))
return;
- fmd_hdl_debug(hdl, "error event '%s'", class);
+ if (vdev_guid)
+ fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
+ vdev_guid);
+ else
+ fmd_hdl_debug(hdl, "error event '%s'", class);
/*
* Determine if we should solve the case and generate a fault. We solve
@@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
+ fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
const char *failmode = NULL;
boolean_t checkremove = B_FALSE;
@@ -815,6 +876,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
checkremove = B_TRUE;
} else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
+ uint64_t slow_io_n, slow_io_t;
+
+ /*
+ * Create a slow io SERD engine when the VDEV has the
+ * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties.
+ */
+ if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
+ nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+ &slow_io_n) == 0 &&
+ nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+ &slow_io_t) == 0) {
+ zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
+ pool_guid, vdev_guid, "slow_io");
+ fmd_serd_create(hdl,
+ zcp->zc_data.zc_serd_slow_io,
+ slow_io_n,
+ SEC2NSEC(slow_io_t));
+ zfs_case_serialize(zcp);
+ }
+ /* Pass event to SERD engine and see if this triggers */
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+ fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
+ ep)) {
+ /*
+ * Ignore a slow io diagnosis when other
+ * VDEVs in the pool show signs of being slow.
+ */
+ if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
+ zfs_case_retire(hdl, zcp);
+ fmd_hdl_debug(hdl, "pool %llu has "
+ "multiple slow io cases -- skip "
+ "degrading vdev %llu",
+ (u_longlong_t)
+ zcp->zc_data.zc_pool_guid,
+ (u_longlong_t)
+ zcp->zc_data.zc_vdev_guid);
+ } else {
+ zfs_case_solve(hdl, zcp,
+ "fault.fs.zfs.vdev.slow_io");
+ }
+ }
+ } else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
/*
* We ignore ereports for checksum errors generated by
@@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+ fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
if (zcp->zc_data.zc_has_remove_timer)
fmd_timer_remove(hdl, zcp->zc_remove_timer);
@@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}
-/*
- * We use the fmd gc entry point to look for old cases that no longer apply.
- * This allows us to keep our set of case data small in a long running system.
- */
-static void
-zfs_fm_gc(fmd_hdl_t *hdl)
-{
- zfs_purge_cases(hdl);
-}
-
static const fmd_hdl_ops_t fmd_ops = {
zfs_fm_recv, /* fmdo_recv */
zfs_fm_timeout, /* fmdo_timeout */
zfs_fm_close, /* fmdo_close */
NULL, /* fmdo_stats */
- zfs_fm_gc, /* fmdo_gc */
+ NULL, /* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
- { "checksum_N", FMD_TYPE_UINT32, "10" },
- { "checksum_T", FMD_TYPE_TIME, "10min" },
- { "io_N", FMD_TYPE_UINT32, "10" },
- { "io_T", FMD_TYPE_TIME, "10min" },
- { "remove_timeout", FMD_TYPE_TIME, "15sec" },
{ NULL, 0, NULL }
};
@@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
-
- zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}
void
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index a0e377a4a..1ef5c631a 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -524,6 +524,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
"fault.fs.zfs.vdev.checksum")) {
degrade_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
+ "fault.fs.zfs.vdev.slow_io")) {
+ degrade_device = B_TRUE;
+ } else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.device")) {
fault_device = B_FALSE;
} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index f1262ed77..a11b6d0b7 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -1083,6 +1083,22 @@ main(int argc, char **argv)
libzfs_fini(g_zfs);
return (1);
}
+
+ if (record.zi_nlanes) {
+ switch (io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ case ZIO_TYPES:
+ break;
+ default:
+ (void) fprintf(stderr, "I/O type for a delay "
+ "must be 'read' or 'write'\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ }
+
if (!error)
error = ENXIO;
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 8753d7263..0783271f4 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
break;
case VDEV_AUX_ERR_EXCEEDED:
- (void) printf(gettext("too many errors"));
+ if (vs->vs_read_errors + vs->vs_write_errors +
+ vs->vs_checksum_errors == 0 && children == 0 &&
+ vs->vs_slow_ios > 0) {
+ (void) printf(gettext("too many slow I/Os"));
+ } else {
+ (void) printf(gettext("too many errors"));
+ }
break;
case VDEV_AUX_IO_FAILURE:
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index fb9e86492..c746600cd 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -82,6 +82,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c6f7dcca7..025567e21 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -366,6 +366,8 @@ typedef enum {
VDEV_PROP_IO_N,
VDEV_PROP_IO_T,
VDEV_PROP_RAIDZ_EXPANDING,
+ VDEV_PROP_SLOW_IO_N,
+ VDEV_PROP_SLOW_IO_T,
VDEV_NUM_PROPS
} vdev_prop_t;
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index dafab66c7..f39ebf031 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -454,12 +455,14 @@ struct vdev {
zfs_ratelimit_t vdev_checksum_rl;
/*
- * Checksum and IO thresholds for tuning ZED
+ * Vdev properties for tuning ZED
*/
uint64_t vdev_checksum_n;
uint64_t vdev_checksum_t;
uint64_t vdev_io_n;
uint64_t vdev_io_t;
+ uint64_t vdev_slow_io_n;
+ uint64_t vdev_slow_io_t;
};
#define VDEV_PAD_SIZE (8 << 10)
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 7c39b134d..cdd2f04c2 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5626,7 +5626,9 @@
<enumerator name='VDEV_PROP_IO_N' value='44'/>
<enumerator name='VDEV_PROP_IO_T' value='45'/>
<enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/>
- <enumerator name='VDEV_NUM_PROPS' value='47'/>
+ <enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/>
+ <enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/>
+ <enumerator name='VDEV_NUM_PROPS' value='49'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index c7b8617ef..402c14a6b 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5264,6 +5264,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T:
+ case VDEV_PROP_SLOW_IO_N:
+ case VDEV_PROP_SLOW_IO_T:
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index d0a63a4da..8e70af2e5 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -1704,7 +1704,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
(prop == VDEV_PROP_CHECKSUM_N ||
prop == VDEV_PROP_CHECKSUM_T ||
prop == VDEV_PROP_IO_N ||
- prop == VDEV_PROP_IO_T)) {
+ prop == VDEV_PROP_IO_T ||
+ prop == VDEV_PROP_SLOW_IO_N ||
+ prop == VDEV_PROP_SLOW_IO_T)) {
*ivalp = UINT64_MAX;
}
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 6eebfa006..3d3ebc072 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -44,7 +44,7 @@ section, below.
Every vdev has a set of properties that export statistics about the vdev
as well as control various behaviors.
Properties are not inherited from top-level vdevs, with the exception of
-checksum_n, checksum_t, io_n, and io_t.
+checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t.
.Pp
The values of numeric properties can be specified using human-readable suffixes
.Po for example,
@@ -117,7 +117,7 @@ If this device is currently being removed from the pool
.Pp
The following native properties can be used to change the behavior of a vdev.
.Bl -tag -width "allocating"
-.It Sy checksum_n , checksum_t , io_n , io_t
+.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t
Tune the fault management daemon by specifying checksum/io thresholds of <N>
errors in <T> seconds, respectively.
These properties can be set on leaf and top-level vdevs.
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7
index 98f3ee7cd..18dfca6dc 100644
--- a/man/man7/zpoolconcepts.7
+++ b/man/man7/zpoolconcepts.7
@@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning.
The underlying conditions are as follows:
.Bl -bullet -compact
.It
-The number of checksum errors exceeds acceptable levels and the device is
-degraded as an indication that something may be wrong.
+The number of checksum errors or slow I/Os exceeds acceptable levels and the
+device is degraded as an indication that something may be wrong.
ZFS continues to use the device as necessary.
.It
The number of I/O errors exceeds acceptable levels.
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
index 4f0bbae81..b692f1213 100644
--- a/man/man8/zinject.8
+++ b/man/man8/zinject.8
@@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state.
.Nm zinject
.Fl d Ar vdev
.Fl D Ar latency : Ns Ar lanes
+.Op Fl T Ar read|write
.Ar pool
.Xc
Add an artificial delay to I/O requests on a particular
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index e98063e8b..e2e3bf5be 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -431,6 +431,12 @@ vdev_prop_init(void)
zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE,
sfeatures);
+ zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX,
+ PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE,
+ sfeatures);
+ zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX,
+ PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE,
+ sfeatures);
/* default index (boolean) properties */
zprop_register_index(VDEV_PROP_REMOVING, "removing", 0,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index d6286dc59..ebba453e2 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -677,6 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
+ vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
+ vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
@@ -3755,6 +3757,18 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
+ &vd->vdev_slow_io_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
+ &vd->vdev_slow_io_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
}
/*
@@ -5970,6 +5984,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_io_t = intval;
break;
+ case VDEV_PROP_SLOW_IO_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_slow_io_n = intval;
+ break;
+ case VDEV_PROP_SLOW_IO_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_slow_io_t = intval;
+ break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@@ -6313,6 +6341,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T:
+ case VDEV_PROP_SLOW_IO_N:
+ case VDEV_PROP_SLOW_IO_T:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index c4eb74e87..481af2ba8 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
case VDEV_PROP_IO_T:
propval = vd->vdev_io_t;
break;
+ case VDEV_PROP_SLOW_IO_N:
+ propval = vd->vdev_slow_io_n;
+ break;
+ case VDEV_PROP_SLOW_IO_T:
+ propval = vd->vdev_slow_io_t;
+ break;
default:
propval = propdef;
break;
@@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
NULL);
}
+ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ uint64_t slow_io_n, slow_io_t;
+
+ slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
+ if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+ DATA_TYPE_UINT64,
+ slow_io_n,
+ NULL);
+
+ slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
+ if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+ DATA_TYPE_UINT64,
+ slow_io_t,
+ NULL);
+ }
+
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 3598351c4..609182f4a 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
+ if (handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
/*
* Defensive; should never happen as the array allocation
* occurs prior to inserting this handler on the list.
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 6a4cd3fe6..a0b74ef4a 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -104,7 +104,8 @@ tags = ['functional', 'devices']
[tests/functional/events:Linux]
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
- 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config']
+ 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
+ 'zed_slow_io', 'zed_slow_io_many_vdevs']
tags = ['functional', 'events']
[tests/functional/fadvise:Linux]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 01af258d5..fe9c92108 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/zed_fd_spill.ksh \
functional/events/zed_io_config.ksh \
functional/events/zed_rc_filter.ksh \
+ functional/events/zed_slow_io.ksh \
+ functional/events/zed_slow_io_many_vdevs.ksh \
functional/exec/cleanup.ksh \
functional/exec/exec_001_pos.ksh \
functional/exec/exec_002_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index 71a64d4fa..c3b9efd64 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
@@ -70,4 +70,6 @@ typeset -a properties=(
checksum_t
io_n
io_t
+ slow_io_n
+ slow_io_t
)
diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh
index ef6e098cf..669b8ae99 100755
--- a/tests/zfs-tests/tests/functional/events/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/events/cleanup.ksh
@@ -26,8 +26,10 @@
. $STF_SUITE/include/libtest.shlib
+zed_stop
+
zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
-zed_stop
+zed_events_drain
default_cleanup
diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh
new file mode 100755
index 000000000..d9fabb2c3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh
@@ -0,0 +1,205 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+# DESCRIPTION:
+# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED.
+#
+# STRATEGY:
+# 1. Create a pool with single vdev
+# 2. Set slow_io_n/slow_io_t to non-default values
+# 3. Inject slow io errors
+# 4. Verify that ZED degrades vdev
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TESTDIR="$TEST_BASE_DIR/zed_slow_io"
+VDEV="$TEST_BASE_DIR/vdevfile.$$"
+TESTPOOL="slow_io_pool"
+FILEPATH="$TESTDIR/slow_io.testfile"
+
+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
+OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
+
+verify_runnable "both"
+
+function do_setup
+{
+ log_must truncate -s 1G $VDEV
+ default_setup_noexit $VDEV
+ zed_events_drain
+ log_must zfs set compression=off $TESTPOOL
+ log_must zfs set primarycache=none $TESTPOOL
+ log_must zfs set prefetch=none $TESTPOOL
+ log_must zfs set recordsize=512 $TESTPOOL
+ for i in {1..10}; do
+ dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null
+ done
+ zpool sync
+}
+
+# intermediate cleanup
+function do_clean
+{
+ log_must zinject -c all
+ log_must zpool destroy $TESTPOOL
+ log_must rm -f $VDEV
+}
+
+# final cleanup
+function cleanup
+{
+ log_must zinject -c all
+
+ # if pool still exists then something failed so log additional info
+ if poolexists $TESTPOOL ; then
+ log_note "$(zpool status -s $TESTPOOL)"
+ echo "=================== zed log search ==================="
+ grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
+ destroy_pool $TESTPOOL
+ fi
+ log_must zed_stop
+
+ log_must rm -f $VDEV
+
+ log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+}
+
+function start_slow_io
+{
+ zpool sync
+ log_must set_tunable64 ZIO_SLOW_IO_MS 10
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
+
+ log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL
+ zpool sync
+}
+
+function stop_slow_io
+{
+ log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+
+ log_must zinject -c all
+}
+
+# Test default ZED settings:
+# inject 10 events over 2.5 seconds, should not degrade.
+function default_degrade
+{
+ do_setup
+
+ start_slow_io
+ for i in {1..10}; do
+ dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+ sleep 0.25
+ done
+ stop_slow_io
+ log_note "$(zpool status -s $TESTPOOL)"
+
+ # give slow ZED a chance to process the delay events
+ sleep 18
+ log_note "$(zpool status -s $TESTPOOL)"
+
+ degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
+ log_note $degrades vdev degrades in ZED log
+ [ $degrades -eq "0" ] || \
+ log_fail "expecting no degrade events, found $degrades"
+
+ do_clean
+}
+
+# change slow_io_n, slow_io_t to 5 events in 60 seconds
+# fire more than 5 events, should degrade
+function slow_io_degrade
+{
+ do_setup
+
+ zpool set slow_io_n=5 $TESTPOOL $VDEV
+ zpool set slow_io_t=60 $TESTPOOL $VDEV
+
+ start_slow_io
+ for i in {1..16}; do
+ dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+ sleep 0.5
+ done
+ stop_slow_io
+ zpool sync
+
+ #
+ # wait up to 60 seconds for kernel to produce at least 5 delay events
+ #
+ typeset -i i=0
+ typeset -i events=0
+ while [[ $i -lt 60 ]]; do
+ events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
+ [[ $events -ge "5" ]] && break
+ i=$((i+1))
+ sleep 1
+ done
+ log_note "$events delay events found"
+
+ if [[ $events -ge "5" ]]; then
+ log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10
+ fi
+
+ do_clean
+}
+
+# change slow_io_n, slow_io_t to 10 events in 1 second
+# inject events spaced 0.5 seconds apart, should not degrade
+function slow_io_no_degrade
+{
+ do_setup
+
+ zpool set slow_io_n=10 $TESTPOOL $VDEV
+ zpool set slow_io_t=1 $TESTPOOL $VDEV
+
+ start_slow_io
+ for i in {1..16}; do
+ dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+ sleep 0.5
+ done
+ stop_slow_io
+ zpool sync
+
+ log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45
+
+ do_clean
+}
+
+log_assert "Test ZED slow io configurability"
+log_onexit cleanup
+
+log_must zed_events_drain
+log_must zed_start
+
+default_degrade
+slow_io_degrade
+slow_io_no_degrade
+
+log_pass "Test ZED slow io configurability"
diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh
new file mode 100755
index 000000000..3357ae2e3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh
@@ -0,0 +1,177 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+# DESCRIPTION:
+# Verify that delay events from multiple vdevs doesnt degrade
+#
+# STRATEGY:
+# 1. Create a pool with a 3 disk raidz vdev
+# 2. Inject slow io errors
+# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TESTDIR="$TEST_BASE_DIR/zed_slow_io"
+VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
+VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
+VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
+VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
+VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
+TESTPOOL="slow_io_pool"
+FILEPATH="$TESTDIR/slow_io.testfile"
+
+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
+OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
+
+verify_runnable "both"
+
+function cleanup
+{
+ log_must zinject -c all
+
+ # if pool still exists then something failed so log additional info
+ if poolexists $TESTPOOL ; then
+ log_note "$(zpool status -s $TESTPOOL)"
+ echo "=================== zed log search ==================="
+ grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
+ destroy_pool $TESTPOOL
+ fi
+ log_must zed_stop
+
+ log_must rm -f $VDEVS
+ log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+}
+
+function start_slow_io
+{
+ for vdev in $VDEVS
+ do
+ log_must zpool set slow_io_n=4 $TESTPOOL $vdev
+ log_must zpool set slow_io_t=60 $TESTPOOL $vdev
+ done
+ zpool sync
+
+ log_must set_tunable64 ZIO_SLOW_IO_MS 10
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
+
+ for vdev in $VDEVS
+ do
+ log_must zinject -d $vdev -D10:1 $TESTPOOL
+ done
+ zpool sync
+}
+
+function stop_slow_io
+{
+ log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+ log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+
+ log_must zinject -c all
+}
+
+function multiple_slow_vdevs_test
+{
+ log_must truncate -s 1G $VDEVS
+ default_raidz_setup_noexit $VDEVS
+
+ log_must zpool events -c
+ log_must zfs set compression=off $TESTPOOL
+ log_must zfs set primarycache=none $TESTPOOL
+ log_must zfs set recordsize=4K $TESTPOOL
+
+ log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20
+ zpool sync
+
+ #
+ # Read the file with slow io injected on the disks
+ # This will cause multiple errors on each disk to trip ZED SERD
+ #
+ # pool: slow_io_pool
+ # state: ONLINE
+ # config:
+ #
+ # NAME STATE READ WRITE CKSUM SLOW
+ # slow_io_pool ONLINE 0 0 0 -
+ # raidz1-0 ONLINE 0 0 0 -
+ # /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113
+ # /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109
+ # /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96
+ # /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109
+ #
+ start_slow_io
+ dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null
+ stop_slow_io
+
+ # count events available for processing
+ typeset -i i=0
+ typeset -i events=0
+ while [[ $i -lt 60 ]]; do
+ events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
+ [[ $events -ge "50" ]] && break
+ i=$((i+1))
+ sleep 1
+ done
+ log_note "$events delay events found"
+ if [[ $events -lt "50" ]]; then
+ log_note "bailing: not enough events to complete the test"
+ destroy_pool $TESTPOOL
+ return
+ fi
+
+ #
+ # give slow ZED a chance to process the delay events
+ #
+ typeset -i i=0
+ typeset -i skips=0
+ while [[ $i -lt 75 ]]; do
+ skips=$(grep "retiring case" \
+ $ZEDLET_DIR/zed.log | wc -l)
+ [[ $skips -gt "0" ]] && break
+ i=$((i+1))
+ sleep 1
+ done
+
+ log_note $skips degrade skips in ZED log after $i seconds
+ [ $skips -gt "0" ] || log_fail "expecting to see skips"
+
+ degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
+ log_note $degrades vdev degrades in ZED log
+ [ $degrades -eq "0" ] || \
+ log_fail "expecting no degrade events, found $degrades"
+
+ destroy_pool $TESTPOOL
+}
+
+log_assert "Test ZED slow io across multiple vdevs"
+log_onexit cleanup
+
+log_must zed_events_drain
+log_must zed_start
+multiple_slow_vdevs_test
+
+log_pass "Test ZED slow io across multiple vdevs"
diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh
index 654343c0c..2959236b5 100755
--- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh
@@ -32,5 +32,6 @@ cleanup_devices $DISKS
zed_stop
zed_cleanup resilver_finish-start-scrub.sh
+zed_events_drain
log_pass
diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh
index 62f1c8ab5..61b9206ec 100755
--- a/tests/zfs-tests/tests/functional/fault/setup.ksh
+++ b/tests/zfs-tests/tests/functional/fault/setup.ksh
@@ -28,6 +28,7 @@
verify_runnable "global"
+zed_events_drain
zed_setup resilver_finish-start-scrub.sh
zed_start