diff options
author | Tony Hutter <[email protected]> | 2018-11-08 16:47:24 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-11-08 16:47:24 -0800 |
commit | ad796b8a3b2565bcd9c7460b7bf9154e4850ca93 (patch) | |
tree | 645cc21be6d49c034f00273276caa82ce59702a7 /module/zfs | |
parent | 877d925a9e816337bb62ee61d564118db0181477 (diff) |
Add zpool status -s (slow I/Os) and -p (parseable)
This patch adds a new slow I/Os (-s) column to zpool status to show the
number of VDEV slow I/Os. This is the number of I/Os that didn't
complete in zio_slow_io_ms milliseconds. It also adds a new parseable
(-p) flag to display exact values.
NAME STATE READ WRITE CKSUM SLOW
testpool ONLINE 0 0 0 -
mirror-0 ONLINE 0 0 0 -
loop0 ONLINE 0 0 0 20
loop1 ONLINE 0 0 0 0
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Tony Hutter <[email protected]>
Closes #7756
Closes #6885
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/vdev.c | 23 | ||||
-rw-r--r-- | module/zfs/vdev_label.c | 3 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 199 | ||||
-rw-r--r-- | module/zfs/zio.c | 34 |
4 files changed, 159 insertions, 100 deletions
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 78e701c80..a99eb93a4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE; int vdev_dtl_sm_blksz = (1 << 12); /* - * Rate limit delay events to this many IO delays per second. + * Rate limit slow IO (delay) events to this many per second. */ -unsigned int zfs_delays_per_second = 20; +unsigned int zfs_slow_io_events_per_second = 20; /* * Rate limit checksum events after this many checksum errors per second. */ -unsigned int zfs_checksums_per_second = 20; +unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. Allows to work around resilver @@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ - zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1); - zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1); + zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, + 1); + zfs_ratelimit_init(&vd->vdev_checksum_rl, + &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); @@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644); MODULE_PARM_DESC(vdev_ms_count_limit, "Practical upper limit of total metaslabs per top-level vdev"); -module_param(zfs_delays_per_second, uint, 0644); -MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " - "IO delays per second"); +module_param(zfs_slow_io_events_per_second, uint, 0644); 
+MODULE_PARM_DESC(zfs_slow_io_events_per_second, + "Rate limit slow IO (delay) events to this many per second"); -module_param(zfs_checksums_per_second, uint, 0644); - MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events " +module_param(zfs_checksum_events_per_second, uint, 0644); +MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events " "to this many checksum errors per second (do not set below zed" "threshold)."); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7e86e3a8b..b3425cf26 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); + /* IO delays */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index e604f33c8..579aa0380 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) return (rc); } -static void +/* + * Return B_TRUE if the event actually posted, B_FALSE if not. + */ +static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) @@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. 
- */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. - */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. 
- */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; - - if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && - (zio != NULL) && (!zio->io_timestamp)) { - /* Ignore bogus delay events */ - return; - } + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (B_FALSE); if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; + return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; + return (B_FALSE); } /* @@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, - DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); + DATA_TYPE_UINT64, vs->vs_checksum_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, + DATA_TYPE_UINT64, vs->vs_slow_ios, + NULL); } if (pvd != NULL) { @@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, /* * Payload for I/Os with corresponding logical information. */ - if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, @@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); + } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; + return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ @@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } #endif -void +/* + * Make sure our event is still valid for the given zio/vdev/pool. For example, + * we don't want to keep logging events for a faulted or missing vdev. 
+ */ +boolean_t +zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) +{ +#ifdef _KERNEL + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return (B_FALSE); + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return (B_FALSE); + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return (B_FALSE); + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return (B_FALSE); + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return (B_FALSE); + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. 
+ */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return (B_FALSE); + + /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ + if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && + (zio != NULL) && (!zio->io_timestamp)) { + return (B_FALSE); + } +#endif + return (B_TRUE); +} + +/* + * Return 0 if event was posted, EINVAL if there was a problem posting it or + * EBUSY if the event was rate limited. + */ +int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (zfs_is_ratelimiting_event(subclass, vd)) - return; + return (SET_ERROR(EBUSY)); - zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size); + if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size)) + return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) - return; + return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif + return (rc); } void @@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, { zio_cksum_report_t *report; - #ifdef _KERNEL if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return; @@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) } -void +int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; - zfs_ereport_start(&ereport, &detector, 
FM_EREPORT_ZFS_CHECKSUM, - spa, vd, zb, zio, offset, length); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) + return (EBUSY); - if (ereport == NULL) - return; + if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length) || (ereport == NULL)) { + return (SET_ERROR(EINVAL)); + } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #endif + return (rc); } /* @@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #if defined(_KERNEL) EXPORT_SYMBOL(zfs_ereport_post); +EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e8c2ca89a..6f1aa640d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif -int zio_delay_max = ZIO_DELAY_MAX; +/* Mark IOs as "slow" if they take longer than 30 seconds */ +int zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) @@ -4431,10 +4432,28 @@ zio_done(zio_t *zio) * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. 
*/ - if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { - if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); + if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { + if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { + /* + * We want to only increment our slow IO counters if + * the IO is valid (i.e. not if the drive is removed). + * + * zfs_ereport_post() will also do these checks, but + * it can also ratelimit and have other failures, so we + * need to increment the slow_io counters independent + * of it. + */ + if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_slow_ios++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0, 0); + } + } } if (zio->io_error) { @@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); -module_param(zio_delay_max, int, 0644); -MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); +module_param(zio_slow_io_ms, int, 0644); +MODULE_PARM_DESC(zio_slow_io_ms, + "Max I/O completion time (milliseconds) before marking it as slow"); module_param(zio_requeue_io_start_cut_in_line, int, 0644); MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); |