Diffstat (limited to 'module')
-rw-r--r--  module/zcommon/zfs_comutil.c   64
-rw-r--r--  module/zfs/fm.c                22
-rw-r--r--  module/zfs/vdev.c              29
-rw-r--r--  module/zfs/zfs_fm.c            46
4 files changed, 151 insertions(+), 10 deletions(-)
diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c
index 6d0314fa7..704ef84c7 100644
--- a/module/zcommon/zfs_comutil.c
+++ b/module/zcommon/zfs_comutil.c
@@ -40,6 +40,7 @@
#include <sys/int_limits.h>
#include <sys/nvpair.h>
#include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
/*
* Are there allocatable vdevs?
@@ -206,10 +207,73 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
"pool split",
};
+/*
+ * Initialize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ * burst: Number to allow in an interval before rate limiting
+ * interval: Interval time in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int burst,
+ unsigned int interval)
+{
+ rl->count = 0;
+ rl->start = 0;
+ rl->interval = interval;
+ rl->burst = burst;
+ mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Re-implementation of the kernel's __ratelimit() function
+ *
+ * We had to write our own rate limiter because the kernel's __ratelimit()
+ * function annoyingly prints to the kernel log how many callbacks it
+ * suppressed (and there's no way to turn that off):
+ *
+ * __ratelimit: 59 callbacks suppressed
+ *
+ * If the kernel ever allows us to disable these prints, we should go back to
+ * using __ratelimit() instead.
+ *
+ * Return values are the same as __ratelimit():
+ *
+ * 0: If we're rate limiting
+ * 1: If we're not rate limiting.
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+ hrtime_t now;
+ hrtime_t elapsed;
+ int rc = 1;
+
+ mutex_enter(&rl->lock);
+
+ now = gethrtime();
+ elapsed = now - rl->start;
+
+ rl->count++;
+ if (NSEC2SEC(elapsed) >= rl->interval) {
+ rl->start = now;
+ rl->count = 0;
+ } else {
+ if (rl->count >= rl->burst) {
+ rc = 0; /* We're ratelimiting */
+ }
+ }
+ mutex_exit(&rl->lock);
+
+ return (rc);
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zfs_allocatable_devs);
EXPORT_SYMBOL(zpool_get_rewind_policy);
EXPORT_SYMBOL(zfs_zpl_version_map);
EXPORT_SYMBOL(zfs_spa_version_map);
EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_ratelimit_init);
+EXPORT_SYMBOL(zfs_ratelimit);
#endif
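
For reference, below is a minimal user-space sketch of the windowed counter that zfs_ratelimit() implements above. It is not part of the patch: clock_gettime() and pthread mutexes stand in for gethrtime() and the kernel mutex, and the names (ratelimit_t, ratelimit_init, ratelimit_allow) are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

typedef struct {
    unsigned int count;     /* events seen in the current interval */
    long long start;        /* start of the current interval (ns) */
    unsigned int interval;  /* interval length in seconds */
    unsigned int burst;     /* events allowed per interval */
    pthread_mutex_t lock;
} ratelimit_t;

static long long
now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ((long long)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

static void
ratelimit_init(ratelimit_t *rl, unsigned int burst, unsigned int interval)
{
    rl->count = 0;
    rl->start = 0;
    rl->interval = interval;
    rl->burst = burst;
    pthread_mutex_init(&rl->lock, NULL);
}

/* Returns 1 if the event is allowed, 0 if it should be rate limited. */
static int
ratelimit_allow(ratelimit_t *rl)
{
    long long now, elapsed;
    int rc = 1;

    pthread_mutex_lock(&rl->lock);

    now = now_ns();
    elapsed = now - rl->start;

    rl->count++;
    if (elapsed / 1000000000LL >= rl->interval) {
        /* A new interval has begun: restart the window and the count. */
        rl->start = now;
        rl->count = 0;
    } else if (rl->count >= rl->burst) {
        rc = 0;     /* over the burst allowance for this interval */
    }

    pthread_mutex_unlock(&rl->lock);
    return (rc);
}

int
main(void)
{
    ratelimit_t rl;
    int i;

    /* Allow a burst of 5 events per 1-second interval. */
    ratelimit_init(&rl, 5, 1);
    for (i = 0; i < 20; i++)
        printf("event %2d: %s\n", i,
            ratelimit_allow(&rl) ? "posted" : "rate limited");
    return (0);
}

Because the counter only resets when an event arrives after the interval has elapsed, an idle vdev costs nothing: there is no timer behind the limiter.
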
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index a1069d140..6c569ffc4 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -84,6 +84,9 @@ static int zevent_len_cur = 0;
static int zevent_waiters = 0;
static int zevent_flags = 0;
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
/*
* The EID (Event IDentifier) is used to uniquely tag a zevent when it is
* posted. The posted EIDs are monotonically increasing but not persistent.
@@ -654,6 +657,12 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
list_insert_head(&ev->ev_ze_list, ze);
(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
*dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+ /* Include events dropped due to rate limiting */
+ *dropped += ratelimit_dropped;
+ ratelimit_dropped = 0;
+#endif
ze->ze_dropped = 0;
out:
mutex_exit(&zevent_lock);
@@ -1587,6 +1596,19 @@ fm_ena_time_get(uint64_t ena)
}
#ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count. Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+ atomic_inc_64(&ratelimit_dropped);
+}
+#endif
+
+#ifdef _KERNEL
void
fm_init(void)
{
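
The dropped-count handoff above works the same way in miniature: ereport paths bump a shared counter, and the event reader folds it into its per-consumer dropped total and clears it. Here is a user-space sketch with C11 atomics; the names are illustrative, and the kernel code clears the counter under zevent_lock rather than with an exchange.

#include <inttypes.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint_fast64_t ratelimit_dropped;  /* shared drop counter */

/* Producer side, cf. fm_erpt_dropped_increment(). */
static void
erpt_dropped_increment(void)
{
    atomic_fetch_add(&ratelimit_dropped, 1);
}

/* Consumer side, cf. the zfs_zevent_next() hunk: fold and clear. */
static uint64_t
next_event_dropped(uint64_t ze_dropped)
{
    return (ze_dropped + atomic_exchange(&ratelimit_dropped, 0));
}

int
main(void)
{
    erpt_dropped_increment();
    erpt_dropped_increment();
    printf("dropped = %" PRIu64 "\n", next_event_dropped(1));  /* prints 3 */
    return (0);
}
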
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5ff5cf3b1..f7e91430f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -44,6 +44,7 @@
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
/*
* When a vdev is added, it will be divided into approximately (but no
@@ -346,12 +347,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_ishole = (ops == &vdev_hole_ops);
+ /*
+ * Initialize rate limit structs for events. We rate limit ZIO delay
+ * and checksum events so that we don't overwhelm ZED with thousands
+ * of events when a disk is acting up.
+ */
+ zfs_ratelimit_init(&vd->vdev_delay_rl, DELAYS_PER_SECOND, 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl, CHECKSUMS_PER_SECOND, 1);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
for (t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
@@ -2221,7 +2231,6 @@ vdev_load(vdev_t *vd)
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
-
/*
* If this is a leaf vdev, load its DTL.
*/
@@ -3458,15 +3467,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
/*
* Notify ZED of any significant state-change on a leaf vdev.
*
- * We ignore transitions from a closed state to healthy unless
- * the parent was degraded.
*/
- if (vd->vdev_ops->vdev_op_leaf &&
- ((save_state > VDEV_STATE_CLOSED) ||
- (vd->vdev_state < VDEV_STATE_HEALTHY) ||
- (vd->vdev_parent != NULL &&
- vd->vdev_parent->vdev_prevstate == VDEV_STATE_DEGRADED))) {
- zfs_post_state_change(spa, vd, save_state);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ /* preserve original state from a vdev_reopen() */
+ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+ (vd->vdev_prevstate != vd->vdev_state) &&
+ (save_state <= VDEV_STATE_CLOSED))
+ save_state = vd->vdev_prevstate;
+
+ /* filter out state change due to initial vdev_open */
+ if (save_state > VDEV_STATE_CLOSED)
+ zfs_post_state_change(spa, vd, save_state);
}
if (!isopen && vd->vdev_parent)
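
The reworked leaf-vdev notification logic above boils down to a small decision: possibly substitute the pre-reopen state for save_state, then post only if the state being left was something other than CLOSED or UNKNOWN. A stand-alone sketch follows; the enum is a stand-in with the same ordering as vdev_state_t, and the function name is illustrative.

#include <stdio.h>

/* Stand-in for vdev_state_t, in the same order as the real enum. */
typedef enum { V_UNKNOWN, V_CLOSED, V_OFFLINE, V_REMOVED, V_CANT_OPEN,
    V_FAULTED, V_DEGRADED, V_HEALTHY } vstate_t;

/*
 * Decide whether a leaf-vdev state change should be reported to ZED.
 * Returns the prior state to report, or V_UNKNOWN to stay quiet.
 */
static vstate_t
state_to_report(vstate_t prevstate, vstate_t newstate, vstate_t save_state)
{
    /* Preserve the original state across a close/reopen cycle. */
    if (prevstate != V_UNKNOWN && prevstate != newstate &&
        save_state <= V_CLOSED)
        save_state = prevstate;

    /* Transitions out of CLOSED/UNKNOWN are just the initial open. */
    return (save_state > V_CLOSED ? save_state : V_UNKNOWN);
}

int
main(void)
{
    /* Initial open: CLOSED -> HEALTHY, nothing is posted. */
    printf("%d\n", state_to_report(V_UNKNOWN, V_HEALTHY, V_CLOSED));
    /* Reopen of a healthy vdev that comes back FAULTED: report HEALTHY. */
    printf("%d\n", state_to_report(V_HEALTHY, V_FAULTED, V_CLOSED));
    return (0);
}
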
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 0d508c0b8..5b6bea7ae 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -112,6 +112,33 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
fm_nvlist_destroy(detector, FM_NVA_FREE);
}
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+ int rc = 0;
+ /*
+	 * zfs_ratelimit() follows __ratelimit()'s convention: it returns 1
+	 * if we're *not* rate limiting and 0 if we are. Invert it here.
+ */
+ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+ } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+ }
+
+ if (rc) {
+ /* We're rate limiting */
+ fm_erpt_dropped_increment();
+ }
+
+ return (rc);
+}
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
@@ -191,6 +218,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
return;
}
+ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+ !zio->io_timestamp) {
+ /* Ignore bogus delay events */
+ return;
+ }
+
/*
* Serialize ereport generation
*/
@@ -738,6 +771,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
if (ereport == NULL)
return;
+ if (zfs_is_ratelimiting_event(subclass, vd))
+ return;
+
/* Cleanup is handled by the callback function */
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
#endif
@@ -748,7 +784,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
{
- zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+ zio_cksum_report_t *report;
+
+
+#ifdef _KERNEL
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return;
+#endif
+
+ report = kmem_zalloc(sizeof (*report), KM_SLEEP);
if (zio->io_vsd != NULL)
zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
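
Putting it together, zfs_is_ratelimiting_event() simply picks the per-vdev limiter that matches the ereport subclass and inverts the limiter's return value. A self-contained sketch with a toy limiter is below; all names and the literal subclass strings are stand-ins for the real structures and FM_EREPORT_ZFS_* constants.

#include <stdio.h>
#include <string.h>

/* Toy stand-in for zfs_ratelimit_t / zfs_ratelimit(): allow 'burst' calls. */
typedef struct { unsigned int count, burst; } limiter_t;

static int
limiter_allow(limiter_t *rl)    /* 1 = allow, 0 = limit, like zfs_ratelimit() */
{
    return (++rl->count <= rl->burst);
}

/* Illustrative stand-in for a vdev's vdev_delay_rl / vdev_checksum_rl. */
struct toy_vdev {
    limiter_t delay_rl;
    limiter_t checksum_rl;
};

/* Returns 1 if the ereport should be dropped (rate limited), 0 otherwise. */
static int
is_ratelimiting_event(const char *subclass, struct toy_vdev *vd)
{
    int allowed = 1;

    if (strcmp(subclass, "delay") == 0)         /* cf. FM_EREPORT_ZFS_DELAY */
        allowed = limiter_allow(&vd->delay_rl);
    else if (strcmp(subclass, "checksum") == 0) /* cf. FM_EREPORT_ZFS_CHECKSUM */
        allowed = limiter_allow(&vd->checksum_rl);

    /* Invert the limiter's convention: 1 here means "drop this ereport". */
    return (!allowed);
}

int
main(void)
{
    struct toy_vdev vd = { { 0, 2 }, { 0, 2 } };
    int i, dropped = 0;

    for (i = 0; i < 10; i++)
        dropped += is_ratelimiting_event("checksum", &vd);
    printf("%d of 10 checksum events dropped\n", dropped); /* 8 */
    return (0);
}
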