diff options
author | Tony Hutter <[email protected]> | 2018-11-08 16:47:24 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-11-08 16:47:24 -0800 |
commit | ad796b8a3b2565bcd9c7460b7bf9154e4850ca93 (patch) | |
tree | 645cc21be6d49c034f00273276caa82ce59702a7 /module/zfs/zfs_fm.c | |
parent | 877d925a9e816337bb62ee61d564118db0181477 (diff) |
Add zpool status -s (slow I/Os) and -p (parseable)
This patch adds a new slow I/Os (-s) column to zpool status to show the
number of VDEV slow I/Os. This is the number of I/Os that didn't
complete in zio_slow_io_ms milliseconds. It also adds a new parseable
(-p) flag to display exact values.
NAME STATE READ WRITE CKSUM SLOW
testpool ONLINE 0 0 0 -
mirror-0 ONLINE 0 0 0 -
loop0 ONLINE 0 0 0 20
loop1 ONLINE 0 0 0 0
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Tony Hutter <[email protected]>
Closes #7756
Closes #6885
Diffstat (limited to 'module/zfs/zfs_fm.c')
-rw-r--r-- | module/zfs/zfs_fm.c | 199 |
1 file changed, 116 insertions, 83 deletions
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index e604f33c8..579aa0380 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) return (rc); } -static void +/* + * Return B_TRUE if the event actually posted, B_FALSE if not. + */ +static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) @@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. 
- */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. - */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; - - if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && - (zio != NULL) && (!zio->io_timestamp)) { - /* Ignore bogus delay events */ - return; - } + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (B_FALSE); if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; + return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; + return (B_FALSE); } /* @@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, - DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); + DATA_TYPE_UINT64, vs->vs_checksum_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, + DATA_TYPE_UINT64, vs->vs_slow_ios, + NULL); } if (pvd != NULL) { @@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, /* * Payload for I/Os with corresponding logical information. 
*/ - if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, @@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); + } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; + return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ @@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } #endif -void +/* + * Make sure our event is still valid for the given zio/vdev/pool. For example, + * we don't want to keep logging events for a faulted or missing vdev. + */ +boolean_t +zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) +{ +#ifdef _KERNEL + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return (B_FALSE); + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return (B_FALSE); + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return (B_FALSE); + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. 
This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return (B_FALSE); + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return (B_FALSE); + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return (B_FALSE); + + /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ + if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && + (zio != NULL) && (!zio->io_timestamp)) { + return (B_FALSE); + } +#endif + return (B_TRUE); +} + +/* + * Return 0 if event was posted, EINVAL if there was a problem posting it or + * EBUSY if the event was rate limited. 
+ */ +int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (zfs_is_ratelimiting_event(subclass, vd)) - return; + return (SET_ERROR(EBUSY)); - zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size); + if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size)) + return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) - return; + return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif + return (rc); } void @@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, { zio_cksum_report_t *report; - #ifdef _KERNEL if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return; @@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) } -void +int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; - zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, - spa, vd, zb, zio, offset, length); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) + return (EBUSY); - if (ereport == NULL) - return; + if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length) || (ereport == NULL)) { + return (SET_ERROR(EINVAL)); + } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = 
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #endif + return (rc); } /* @@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #if defined(_KERNEL) EXPORT_SYMBOL(zfs_ereport_post); +EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); |