aboutsummaryrefslogtreecommitdiffstats
path: root/cmd
diff options
context:
space:
mode:
authorDon Brady <[email protected]>2024-02-08 10:19:52 -0700
committerTony Hutter <[email protected]>2024-04-29 13:50:05 -0700
commitc1c26a77ff38770b80ed1c97aea867d3ad9bf6ee (patch)
tree694326890ff3acb5e4fc2b7122f438a9e02429ce /cmd
parentdb65272aef3d380d2bd1c94907826f2b9ec9205e (diff)
Add slow disk diagnosis to ZED
Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter <[email protected]> Reviewed-by: Allan Jude <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Co-authored-by: Rob Wing <[email protected]> Signed-off-by: Don Brady <[email protected]> Closes #15469
Diffstat (limited to 'cmd')
-rw-r--r--cmd/zed/agents/fmd_api.c57
-rw-r--r--cmd/zed/agents/fmd_api.h3
-rw-r--r--cmd/zed/agents/fmd_serd.c3
-rw-r--r--cmd/zed/agents/fmd_serd.h2
-rw-r--r--cmd/zed/agents/zfs_diagnosis.c143
-rw-r--r--cmd/zed/agents/zfs_retire.c3
-rw-r--r--cmd/zinject/zinject.c16
-rw-r--r--cmd/zpool/zpool_main.c8
8 files changed, 174 insertions, 61 deletions
diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c
index 4a6cfbf8c..fe43e2ab9 100644
--- a/cmd/zed/agents/fmd_api.c
+++ b/cmd/zed/agents/fmd_api.c
@@ -22,6 +22,7 @@
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
if (strcmp(name, "spare_on_remove") == 0)
return (1);
- if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
- return (10); /* N = 10 events */
-
- return (0);
-}
-
-int64_t
-fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
-{
- (void) hdl;
-
- /*
- * These can be looked up in mp->modinfo->fmdi_props
- * For now we just hard code for phase 2. In the
- * future, there can be a ZED based override.
- */
- if (strcmp(name, "remove_timeout") == 0)
- return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
-
- if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
- return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
-
return (0);
}
@@ -535,20 +514,31 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
}
-void
-fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+int
+fmd_serd_active(fmd_hdl_t *hdl, const char *name)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
- return;
+ return (0);
}
+ return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
+}
- fmd_serd_eng_reset(sgp);
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
- fmd_hdl_debug(hdl, "serd_reset %s", name);
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+ } else {
+ fmd_serd_eng_reset(sgp);
+ fmd_hdl_debug(hdl, "serd_reset %s", name);
+ }
}
int
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
- int err;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
name);
return (0);
}
- err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+ return (fmd_serd_eng_record(sgp, ep->ev_hrt));
+}
+
+void
+fmd_serd_gc(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
- return (err);
+ fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
}
/* FMD Timers */
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
struct itimerspec its;
- fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+ fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
/* disarm the timer */
memset(&its, 0, sizeof (struct itimerspec));
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
index b940d0d39..8471feecf 100644
--- a/cmd/zed/agents/fmd_api.h
+++ b/cmd/zed/agents/fmd_api.h
@@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
-extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
@@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern int fmd_serd_active(fmd_hdl_t *, const char *);
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+extern void fmd_serd_gc(fmd_hdl_t *);
extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
extern void fmd_timer_remove(fmd_hdl_t *, id_t);
diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c
index 0bb2c535f..f942e62b3 100644
--- a/cmd/zed/agents/fmd_serd.c
+++ b/cmd/zed/agents/fmd_serd.c
@@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
}
void
-fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
{
+ (void) arg;
fmd_serd_elem_t *sep, *nep;
hrtime_t hrt;
diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h
index 25b6888e6..80ff9a3b2 100644
--- a/cmd/zed/agents/fmd_serd.h
+++ b/cmd/zed/agents/fmd_serd.h
@@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
-extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
#ifdef __cplusplus
}
diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
index f6ba334a3..e0ad00800 100644
--- a/cmd/zed/agents/zfs_diagnosis.c
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -23,6 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
*/
#include <stddef.h>
@@ -47,11 +48,16 @@
#define DEFAULT_CHECKSUM_T 600 /* seconds */
#define DEFAULT_IO_N 10 /* events */
#define DEFAULT_IO_T 600 /* seconds */
+#define DEFAULT_SLOW_IO_N 10 /* events */
+#define DEFAULT_SLOW_IO_T 30 /* seconds */
+
+#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */
/*
- * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
- * #define reserves enough space for two 64-bit hex values plus the length of
- * the longest string.
+ * Our serd engines are named in the following format:
+ * 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
+ * This #define reserves enough space for two 64-bit hex values plus the
+ * length of the longest string.
*/
#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
@@ -68,6 +74,7 @@ typedef struct zfs_case_data {
int zc_pool_state;
char zc_serd_checksum[MAX_SERDLEN];
char zc_serd_io[MAX_SERDLEN];
+ char zc_serd_slow_io[MAX_SERDLEN];
int zc_has_remove_timer;
} zfs_case_data_t;
@@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};
-static hrtime_t zfs_remove_timeout;
+/* wait 15 seconds after a removal */
+static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;
@@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
#define ZFS_MAKE_EREPORT(type) \
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+static void zfs_purge_cases(fmd_hdl_t *hdl);
+
/*
* Write out the persistent representation of an active case.
*/
@@ -171,6 +181,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
}
/*
+ * count other unique slow-io cases in a pool
+ */
+static uint_t
+zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
+{
+ zfs_case_t *zcp;
+ uint_t cases = 0;
+ static hrtime_t next_check = 0;
+
+ /*
+ * Note that plumbing in some external GC would require adding locking,
+ * since most of this module code is not thread safe and assumes there
+ * is only one thread running against the module. So we perform GC here
+ * inline periodically so that future delay induced faults will be
+ * possible once the issue causing multiple vdev delays is resolved.
+ */
+ if (gethrestime_sec() > next_check) {
+ /* Periodically purge old SERD entries and stale cases */
+ fmd_serd_gc(hdl);
+ zfs_purge_cases(hdl);
+ next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
+ }
+
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
+ zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
+ zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+ fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
+ cases++;
+ }
+ }
+ return (cases);
+}
+
+/*
* Iterate over any active cases. If any cases are associated with a pool or
* vdev which is no longer present on the system, close the associated case.
*/
@@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
(long long unsigned int)vdev_guid, type);
}
+static void
+zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+ fmd_hdl_debug(hdl, "retiring case");
+
+ fmd_case_close(hdl, zcp->zc_case);
+}
+
/*
* Solve a given ZFS case. This first checks to make sure the diagnosis is
* still valid, as well as cleaning up any pending timer associated with the
@@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (strcmp(class,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
strcmp(class,
- ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
- strcmp(class,
- ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
zfs_stats.resource_drops.fmds_value.ui64++;
return;
}
@@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (zcp->zc_data.zc_serd_checksum[0] != '\0')
fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_checksum);
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+ fmd_serd_reset(hdl,
+ zcp->zc_data.zc_serd_slow_io);
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
uint64_t state = 0;
@@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_case_solved(hdl, zcp->zc_case))
return;
- fmd_hdl_debug(hdl, "error event '%s'", class);
+ if (vdev_guid)
+ fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
+ vdev_guid);
+ else
+ fmd_hdl_debug(hdl, "error event '%s'", class);
/*
* Determine if we should solve the case and generate a fault. We solve
@@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
+ fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
const char *failmode = NULL;
boolean_t checkremove = B_FALSE;
@@ -815,6 +876,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
checkremove = B_TRUE;
} else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
+ uint64_t slow_io_n, slow_io_t;
+
+ /*
+ * Create a slow io SERD engine when the VDEV has the
+ * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties.
+ */
+ if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
+ nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+ &slow_io_n) == 0 &&
+ nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+ &slow_io_t) == 0) {
+ zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
+ pool_guid, vdev_guid, "slow_io");
+ fmd_serd_create(hdl,
+ zcp->zc_data.zc_serd_slow_io,
+ slow_io_n,
+ SEC2NSEC(slow_io_t));
+ zfs_case_serialize(zcp);
+ }
+ /* Pass event to SERD engine and see if this triggers */
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+ fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
+ ep)) {
+ /*
+ * Ignore a slow io diagnosis when other
+ * VDEVs in the pool show signs of being slow.
+ */
+ if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
+ zfs_case_retire(hdl, zcp);
+ fmd_hdl_debug(hdl, "pool %llu has "
+ "multiple slow io cases -- skip "
+ "degrading vdev %llu",
+ (u_longlong_t)
+ zcp->zc_data.zc_pool_guid,
+ (u_longlong_t)
+ zcp->zc_data.zc_vdev_guid);
+ } else {
+ zfs_case_solve(hdl, zcp,
+ "fault.fs.zfs.vdev.slow_io");
+ }
+ }
+ } else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
/*
* We ignore ereports for checksum errors generated by
@@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+ if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+ fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
if (zcp->zc_data.zc_has_remove_timer)
fmd_timer_remove(hdl, zcp->zc_remove_timer);
@@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}
-/*
- * We use the fmd gc entry point to look for old cases that no longer apply.
- * This allows us to keep our set of case data small in a long running system.
- */
-static void
-zfs_fm_gc(fmd_hdl_t *hdl)
-{
- zfs_purge_cases(hdl);
-}
-
static const fmd_hdl_ops_t fmd_ops = {
zfs_fm_recv, /* fmdo_recv */
zfs_fm_timeout, /* fmdo_timeout */
zfs_fm_close, /* fmdo_close */
NULL, /* fmdo_stats */
- zfs_fm_gc, /* fmdo_gc */
+ NULL, /* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
- { "checksum_N", FMD_TYPE_UINT32, "10" },
- { "checksum_T", FMD_TYPE_TIME, "10min" },
- { "io_N", FMD_TYPE_UINT32, "10" },
- { "io_T", FMD_TYPE_TIME, "10min" },
- { "remove_timeout", FMD_TYPE_TIME, "15sec" },
{ NULL, 0, NULL }
};
@@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
-
- zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}
void
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index a0e377a4a..1ef5c631a 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -524,6 +524,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
"fault.fs.zfs.vdev.checksum")) {
degrade_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
+ "fault.fs.zfs.vdev.slow_io")) {
+ degrade_device = B_TRUE;
+ } else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.device")) {
fault_device = B_FALSE;
} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index f1262ed77..a11b6d0b7 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -1083,6 +1083,22 @@ main(int argc, char **argv)
libzfs_fini(g_zfs);
return (1);
}
+
+ if (record.zi_nlanes) {
+ switch (io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ case ZIO_TYPES:
+ break;
+ default:
+ (void) fprintf(stderr, "I/O type for a delay "
+ "must be 'read' or 'write'\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ }
+
if (!error)
error = ENXIO;
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 5fa06b4bf..a7062615c 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
break;
case VDEV_AUX_ERR_EXCEEDED:
- (void) printf(gettext("too many errors"));
+ if (vs->vs_read_errors + vs->vs_write_errors +
+ vs->vs_checksum_errors == 0 && children == 0 &&
+ vs->vs_slow_ios > 0) {
+ (void) printf(gettext("too many slow I/Os"));
+ } else {
+ (void) printf(gettext("too many errors"));
+ }
break;
case VDEV_AUX_IO_FAILURE: