Diffstat (limited to 'module')
-rw-r--r--   module/zcommon/zfs_comutil.c   | 64
-rw-r--r--   module/zfs/fm.c                | 22
-rw-r--r--   module/zfs/vdev.c              | 29
-rw-r--r--   module/zfs/zfs_fm.c            | 46
4 files changed, 151 insertions, 10 deletions
diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c
index 6d0314fa7..704ef84c7 100644
--- a/module/zcommon/zfs_comutil.c
+++ b/module/zcommon/zfs_comutil.c
@@ -40,6 +40,7 @@
 #include <sys/int_limits.h>
 #include <sys/nvpair.h>
 #include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
 
 /*
  * Are there allocatable vdevs?
@@ -206,10 +207,73 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
 	"pool split",
 };
 
+/*
+ * Initialize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ * burst: Number to allow in an interval before rate limiting
+ * interval: Interval time in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int burst,
+    unsigned int interval)
+{
+	rl->count = 0;
+	rl->start = 0;
+	rl->interval = interval;
+	rl->burst = burst;
+	mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Re-implementation of the kernel's __ratelimit() function
+ *
+ * We had to write our own rate limiter because the kernel's __ratelimit()
+ * function annoyingly prints out how many times it rate limited to the kernel
+ * logs (and there's no way to turn it off):
+ *
+ *	__ratelimit: 59 callbacks suppressed
+ *
+ * If the kernel ever allows us to disable these prints, we should go back to
+ * using __ratelimit() instead.
+ *
+ * Return values are the same as __ratelimit():
+ *
+ *	0: If we're rate limiting
+ *	1: If we're not rate limiting.
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+	hrtime_t now;
+	hrtime_t elapsed;
+	int rc = 1;
+
+	mutex_enter(&rl->lock);
+
+	now = gethrtime();
+	elapsed = now - rl->start;
+
+	rl->count++;
+	if (NSEC2SEC(elapsed) >= rl->interval) {
+		rl->start = now;
+		rl->count = 0;
+	} else {
+		if (rl->count >= rl->burst) {
+			rc = 0; /* We're ratelimiting */
+		}
+	}
+	mutex_exit(&rl->lock);
+
+	return (rc);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zfs_allocatable_devs);
 EXPORT_SYMBOL(zpool_get_rewind_policy);
 EXPORT_SYMBOL(zfs_zpl_version_map);
 EXPORT_SYMBOL(zfs_spa_version_map);
 EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_ratelimit_init);
+EXPORT_SYMBOL(zfs_ratelimit);
 #endif
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index a1069d140..6c569ffc4 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -84,6 +84,9 @@ static int zevent_len_cur = 0;
 static int zevent_waiters = 0;
 static int zevent_flags = 0;
 
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
 /*
  * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
  * posted. The posted EIDs are monotonically increasing but not persistent.
@@ -654,6 +657,12 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
 	list_insert_head(&ev->ev_ze_list, ze);
 	(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
 	*dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+	/* Include events dropped due to rate limiting */
+	*dropped += ratelimit_dropped;
+	ratelimit_dropped = 0;
+#endif
 	ze->ze_dropped = 0;
 out:
 	mutex_exit(&zevent_lock);
@@ -1587,6 +1596,19 @@ fm_ena_time_get(uint64_t ena)
 }
 
 #ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count. Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+	atomic_inc_64(&ratelimit_dropped);
+}
+#endif
+
+#ifdef _KERNEL
 void
 fm_init(void)
 {
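The zfs_ratelimit_init()/zfs_ratelimit() pair added above implements a simple burst-per-interval limiter: up to 'burst' calls are allowed per 'interval' seconds, the rest are refused silently, and the suppressed count is surfaced through the new ratelimit_dropped counter instead of being printed to the kernel log. The following is a rough, self-contained userspace sketch of that behaviour, not the ZFS code itself; the ratelimit_t type, the pthread mutex and CLOCK_MONOTONIC below are illustrative stand-ins for the kernel's kmutex and gethrtime() primitives.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

typedef struct {
	pthread_mutex_t lock;
	struct timespec start;		/* start of the current interval */
	unsigned int count;		/* calls seen in this interval */
	unsigned int burst;		/* calls allowed per interval */
	unsigned int interval;		/* interval length in seconds */
} ratelimit_t;

static void
ratelimit_init(ratelimit_t *rl, unsigned int burst, unsigned int interval)
{
	rl->count = 0;
	rl->start.tv_sec = 0;
	rl->start.tv_nsec = 0;
	rl->burst = burst;
	rl->interval = interval;
	pthread_mutex_init(&rl->lock, NULL);
}

/* Returns 1 if the caller may proceed, 0 if it is being rate limited. */
static int
ratelimit(ratelimit_t *rl)
{
	struct timespec now;
	int rc = 1;

	pthread_mutex_lock(&rl->lock);
	clock_gettime(CLOCK_MONOTONIC, &now);

	rl->count++;
	if (now.tv_sec - rl->start.tv_sec >= (time_t)rl->interval) {
		/* A new interval has started: reset the window. */
		rl->start = now;
		rl->count = 0;
	} else if (rl->count >= rl->burst) {
		rc = 0;	/* over budget for this interval */
	}
	pthread_mutex_unlock(&rl->lock);

	return (rc);
}

int
main(void)
{
	ratelimit_t rl;
	int i, allowed = 0, suppressed = 0;

	ratelimit_init(&rl, 5, 1);	/* allow 5 events per second */
	for (i = 0; i < 1000; i++) {
		if (ratelimit(&rl))
			allowed++;
		else
			suppressed++;
	}
	printf("allowed=%d suppressed=%d\n", allowed, suppressed);
	return (0);
}

Built with -lpthread, the loop finishes well inside the one-second window, so it should report about 5 allowed calls and 995 suppressed ones; the suppressed count is the kind of feedback the fm.c counter above carries back to the consumer.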
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5ff5cf3b1..f7e91430f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -44,6 +44,7 @@
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
 
 /*
  * When a vdev is added, it will be divided into approximately (but no
@@ -346,12 +347,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 
+	/*
+	 * Initialize rate limit structs for events. We rate limit ZIO delay
+	 * and checksum events so that we don't overwhelm ZED with thousands
+	 * of events when a disk is acting up.
+	 */
+	zfs_ratelimit_init(&vd->vdev_delay_rl, DELAYS_PER_SECOND, 1);
+	zfs_ratelimit_init(&vd->vdev_checksum_rl, CHECKSUMS_PER_SECOND, 1);
+
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
 	for (t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 		    &vd->vdev_dtl_lock);
@@ -2221,7 +2231,6 @@ vdev_load(vdev_t *vd)
 	    vdev_metaslab_init(vd, 0) != 0))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
-
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
@@ -3458,15 +3467,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 	/*
 	 * Notify ZED of any significant state-change on a leaf vdev.
 	 *
-	 * We ignore transitions from a closed state to healthy unless
-	 * the parent was degraded.
 	 */
-	if (vd->vdev_ops->vdev_op_leaf &&
-	    ((save_state > VDEV_STATE_CLOSED) ||
-	    (vd->vdev_state < VDEV_STATE_HEALTHY) ||
-	    (vd->vdev_parent != NULL &&
-	    vd->vdev_parent->vdev_prevstate == VDEV_STATE_DEGRADED))) {
-		zfs_post_state_change(spa, vd, save_state);
+	if (vd->vdev_ops->vdev_op_leaf) {
+		/* preserve original state from a vdev_reopen() */
+		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+		    (vd->vdev_prevstate != vd->vdev_state) &&
+		    (save_state <= VDEV_STATE_CLOSED))
+			save_state = vd->vdev_prevstate;
+
+		/* filter out state change due to initial vdev_open */
+		if (save_state > VDEV_STATE_CLOSED)
+			zfs_post_state_change(spa, vd, save_state);
 	}
 
 	if (!isopen && vd->vdev_parent)
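The vdev_set_state() hunk above also changes when ZED is told about a leaf-vdev state change: if the vdev was just reopened, the state recorded before the reopen is treated as the "from" state, and transitions whose "from" state is still CLOSED (the initial vdev_open()) are filtered out. Below is a standalone sketch of that decision as a pure predicate; the enum merely mirrors the ordering of the real VDEV_STATE_* constants and all names here are illustrative, not the ZFS definitions.

#include <stdio.h>

typedef enum {
	STATE_UNKNOWN = 0,	/* mirrors VDEV_STATE_UNKNOWN */
	STATE_CLOSED,
	STATE_OFFLINE,
	STATE_REMOVED,
	STATE_CANT_OPEN,
	STATE_FAULTED,
	STATE_DEGRADED,
	STATE_HEALTHY
} state_t;

/*
 * Decide whether a leaf-vdev state change should be reported.
 * prevstate:  state recorded before a vdev_reopen()
 * state:      new state being set
 * save_state: state the vdev held immediately before this change
 */
static int
should_notify(int is_leaf, state_t prevstate, state_t state,
    state_t save_state)
{
	if (!is_leaf)
		return (0);

	/* preserve original state from a vdev_reopen() */
	if (prevstate != STATE_UNKNOWN && prevstate != state &&
	    save_state <= STATE_CLOSED)
		save_state = prevstate;

	/* filter out the state change caused by the initial vdev_open() */
	return (save_state > STATE_CLOSED);
}

int
main(void)
{
	/* fresh vdev opening for the first time: no notification */
	printf("%d\n", should_notify(1, STATE_UNKNOWN, STATE_HEALTHY,
	    STATE_CLOSED));
	/* reopen of a previously healthy vdev that comes back faulted */
	printf("%d\n", should_notify(1, STATE_HEALTHY, STATE_FAULTED,
	    STATE_CLOSED));
	return (0);
}

The first call models an initial open and prints 0 (filtered out); the second models a reopen of a previously healthy vdev that comes back faulted and prints 1 (ZED would be notified).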
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 0d508c0b8..5b6bea7ae 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -112,6 +112,33 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
 		fm_nvlist_destroy(detector, FM_NVA_FREE);
 }
 
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+	int rc = 0;
+	/*
+	 * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
+	 * are. Invert it to get our return value.
+	 */
+	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+		rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+	} else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+		rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+	}
+
+	if (rc) {
+		/* We're rate limiting */
+		fm_erpt_dropped_increment();
+	}
+
+	return (rc);
+}
 
 static void
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
@@ -191,6 +218,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		return;
 	}
 
+	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+	    !zio->io_timestamp) {
+		/* Ignore bogus delay events */
+		return;
+	}
+
 	/*
 	 * Serialize ereport generation
 	 */
@@ -738,6 +771,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 	if (ereport == NULL)
 		return;
 
+	if (zfs_is_ratelimiting_event(subclass, vd))
+		return;
+
 	/* Cleanup is handled by the callback function */
 	zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #endif
 }
@@ -748,7 +784,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     struct zio *zio, uint64_t offset, uint64_t length, void *arg,
     zio_bad_cksum_t *info)
 {
-	zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+	zio_cksum_report_t *report;
+
+
+#ifdef _KERNEL
+	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+		return;
+#endif
+
+	report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 
 	if (zio->io_vsd != NULL)
 		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
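Taken together, the zfs_fm.c and fm.c changes form a small feedback loop: producers consult the per-vdev limiter before posting, each suppressed post bumps a counter via fm_erpt_dropped_increment(), and zfs_zevent_next() folds that counter into the dropped total it hands back to the consumer. The sketch below is a minimal userspace analog of that flow with illustrative names throughout; limiter_allows() stands in for zfs_ratelimit(), and the real code drains the counter under zevent_lock rather than with an atomic exchange.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* events suppressed since the consumer last fetched one */
static atomic_uint_fast64_t ratelimit_dropped;

/* Stand-in for zfs_ratelimit(): pretend only every 4th post is allowed. */
static int
limiter_allows(int n)
{
	return (n % 4 == 0);
}

/* Producer side: returns 1 when the event is being rate limited. */
static int
is_ratelimiting_event(int n)
{
	int limited = !limiter_allows(n);

	if (limited)
		atomic_fetch_add(&ratelimit_dropped, 1);
	return (limited);
}

/* Consumer side: fold suppressed events into the dropped count it reports. */
static uint64_t
next_event_dropped(uint64_t queue_dropped)
{
	return (queue_dropped + atomic_exchange(&ratelimit_dropped, 0));
}

int
main(void)
{
	int n, posted = 0;

	for (n = 0; n < 100; n++) {
		if (!is_ratelimiting_event(n))
			posted++;
	}
	printf("posted=%d dropped=%llu\n", posted,
	    (unsigned long long)next_event_dropped(0));
	return (0);
}

Here 25 of the 100 posts get through and the consumer sees a dropped count of 75 the next time it reads an event, which is how the rate-limited ereports remain visible to the user instead of vanishing silently.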