author    Tony Hutter <[email protected]>  2016-10-19 12:55:59 -0700
committer Brian Behlendorf <[email protected]>  2016-10-19 12:55:59 -0700
commit    6078881aa18a45ea065a887e2a8606279cdc0329
tree      d6af96c545969994afdf2bf84ee1484b09cdf76c
parent    7c502b0b1de8d3d341c026760df5915ad4be794a
Multipath autoreplace, control enclosure LEDs, event rate limiting
1. Enable multipath autoreplace support for FMA.

   This extends FMA autoreplace to work with multipath disks. It
   requires libdevmapper to be installed at build time.

2. Turn on/off fault LEDs when VDEVs become degraded/faulted/online.

   Set ZED_USE_ENCLOSURE_LEDS=1 in zed.rc to have ZED turn the
   enclosure LED for a drive on or off when the drive becomes
   FAULTED/DEGRADED. Your enclosure must be supported by the Linux
   SES driver for this to work. The enclosure LED scripts work for
   multipath devices as well, and they clear the LED when the fault
   is cleared.

3. Rate limit ZIO delay and checksum events so as not to flood ZED.

   ZIO delay and checksum events are rate limited to 5/sec in the
   zfs module.

Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Don Brady <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Tony Hutter <[email protected]>
Closes #2449
Closes #3017
Closes #5159
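For reference, the enclosure-LED behavior in item 2 is enabled by a single
zed.rc setting (the file is commonly installed at /etc/zfs/zed.d/zed.rc,
though the path can vary by distribution). ZED reads zed.rc at startup, so
it must be restarted to pick up the change:

    # Light the enclosure fault LED when a drive becomes FAULTED/DEGRADED,
    # and clear it when the fault clears. Requires an enclosure supported
    # by the Linux SES driver.
    ZED_USE_ENCLOSURE_LEDS=1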
Diffstat (limited to 'module')
-rw-r--r--  module/zcommon/zfs_comutil.c   64
-rw-r--r--  module/zfs/fm.c                22
-rw-r--r--  module/zfs/vdev.c              29
-rw-r--r--  module/zfs/zfs_fm.c            46
4 files changed, 151 insertions(+), 10 deletions(-)
diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c
index 6d0314fa7..704ef84c7 100644
--- a/module/zcommon/zfs_comutil.c
+++ b/module/zcommon/zfs_comutil.c
@@ -40,6 +40,7 @@
#include <sys/int_limits.h>
#include <sys/nvpair.h>
#include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
/*
* Are there allocatable vdevs?
@@ -206,10 +207,73 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
"pool split",
};
+/*
+ * Initialize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ * burst: Number to allow in an interval before rate limiting
+ * interval: Interval time in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int burst,
+ unsigned int interval)
+{
+ rl->count = 0;
+ rl->start = 0;
+ rl->interval = interval;
+ rl->burst = burst;
+ mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Re-implementation of the kernel's __ratelimit() function
+ *
+ * We had to write our own rate limiter because the kernel's __ratelimit()
+ * function annoyingly prints out how many times it rate limited to the kernel
+ * logs (and there's no way to turn it off):
+ *
+ * __ratelimit: 59 callbacks suppressed
+ *
+ * If the kernel ever allows us to disable these prints, we should go back to
+ * using __ratelimit() instead.
+ *
+ * Return values are the same as __ratelimit():
+ *
+ * 0: If we're rate limiting
+ * 1: If we're not rate limiting.
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+ hrtime_t now;
+ hrtime_t elapsed;
+ int rc = 1;
+
+ mutex_enter(&rl->lock);
+
+ now = gethrtime();
+ elapsed = now - rl->start;
+
+ rl->count++;
+ if (NSEC2SEC(elapsed) >= rl->interval) {
+ rl->start = now;
+ rl->count = 0;
+ } else {
+ if (rl->count >= rl->burst) {
+ rc = 0; /* We're ratelimiting */
+ }
+ }
+ mutex_exit(&rl->lock);
+
+ return (rc);
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zfs_allocatable_devs);
EXPORT_SYMBOL(zpool_get_rewind_policy);
EXPORT_SYMBOL(zfs_zpl_version_map);
EXPORT_SYMBOL(zfs_spa_version_map);
EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_ratelimit_init);
+EXPORT_SYMBOL(zfs_ratelimit);
#endif
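To see the burst/interval behavior of the rate limiter added above outside
the kernel, here is a minimal userspace sketch of the same algorithm.
Everything kernel-specific is a stand-in: clock_gettime() replaces
gethrtime(), a pthread mutex replaces the kernel mutex, and the burst of
5 per 1-second interval matches the commit message, not code shown here.

    /*
     * Minimal userspace sketch of zfs_ratelimit() above.
     * Illustration only, not the in-tree code.
     */
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    typedef struct {
    	pthread_mutex_t lock;
    	long long start;        /* start of current interval (ns) */
    	unsigned int count;     /* events seen in current interval */
    	unsigned int interval;  /* interval length (seconds) */
    	unsigned int burst;     /* events allowed per interval */
    } ratelimit_t;

    static long long
    now_ns(void)
    {
    	struct timespec ts;
    	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
    	return ((long long)ts.tv_sec * 1000000000LL + ts.tv_nsec);
    }

    /* Same contract as zfs_ratelimit(): 1 = allow the event, 0 = limit. */
    static int
    ratelimit(ratelimit_t *rl)
    {
    	int rc = 1;
    	long long elapsed;

    	pthread_mutex_lock(&rl->lock);
    	elapsed = now_ns() - rl->start;
    	rl->count++;
    	if (elapsed / 1000000000LL >= (long long)rl->interval) {
    		rl->start = now_ns();   /* new interval: reset the window */
    		rl->count = 0;
    	} else if (rl->count >= rl->burst) {
    		rc = 0;                 /* over the burst: rate limiting */
    	}
    	pthread_mutex_unlock(&rl->lock);
    	return (rc);
    }

    int
    main(void)
    {
    	ratelimit_t rl = {
    		.lock = PTHREAD_MUTEX_INITIALIZER,
    		.interval = 1,  /* seconds */
    		.burst = 5      /* 5/sec, per the commit message */
    	};
    	int i;

    	for (i = 0; i < 8; i++)
    		printf("event %d: %s\n", i,
    		    ratelimit(&rl) ? "posted" : "rate limited");
    	return (0);
    }

Compiled with cc file.c -lpthread, this prints five "posted" lines followed
by three "rate limited" ones, since all eight calls land in one interval.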
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index a1069d140..6c569ffc4 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -84,6 +84,9 @@ static int zevent_len_cur = 0;
static int zevent_waiters = 0;
static int zevent_flags = 0;
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
/*
* The EID (Event IDentifier) is used to uniquely tag a zevent when it is
* posted. The posted EIDs are monotonically increasing but not persistent.
@@ -654,6 +657,12 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
list_insert_head(&ev->ev_ze_list, ze);
(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
*dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+ /* Include events dropped due to rate limiting */
+ *dropped += ratelimit_dropped;
+ ratelimit_dropped = 0;
+#endif
ze->ze_dropped = 0;
out:
mutex_exit(&zevent_lock);
@@ -1587,6 +1596,19 @@ fm_ena_time_get(uint64_t ena)
}
#ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count. Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+ atomic_inc_64(&ratelimit_dropped);
+}
+#endif
+
+#ifdef _KERNEL
void
fm_init(void)
{
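The fm.c changes pair a lock-free producer-side counter with a consumer
that folds it into the event's dropped count and zeroes it inside
zfs_zevent_next(). A userspace sketch of that hand-off, with C11 atomics
as a hypothetical stand-in for atomic_inc_64() and for the read-and-zero
done under zevent_lock:

    /*
     * Userspace sketch of the dropped-count hand-off. Illustration only.
     */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t ratelimit_dropped;

    /* Producer side: called each time an ereport is rate limited. */
    static void
    erpt_dropped_increment(void)
    {
    	atomic_fetch_add(&ratelimit_dropped, 1);
    }

    /* Consumer side: fold rate-limited drops into the dropped count. */
    static uint64_t
    fold_dropped(uint64_t ze_dropped)
    {
    	return (ze_dropped + atomic_exchange(&ratelimit_dropped, 0));
    }

    int
    main(void)
    {
    	erpt_dropped_increment();
    	erpt_dropped_increment();
    	printf("dropped = %llu\n", (unsigned long long)fold_dropped(1));
    	/* prints "dropped = 3"; the counter is now zero again */
    	return (0);
    }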
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5ff5cf3b1..f7e91430f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -44,6 +44,7 @@
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
/*
* When a vdev is added, it will be divided into approximately (but no
@@ -346,12 +347,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_ishole = (ops == &vdev_hole_ops);
+ /*
+ * Initialize rate limit structs for events. We rate limit ZIO delay
+ * and checksum events so that we don't overwhelm ZED with thousands
+ * of events when a disk is acting up.
+ */
+ zfs_ratelimit_init(&vd->vdev_delay_rl, DELAYS_PER_SECOND, 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl, CHECKSUMS_PER_SECOND, 1);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
for (t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
@@ -2221,7 +2231,6 @@ vdev_load(vdev_t *vd)
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
-
/*
* If this is a leaf vdev, load its DTL.
*/
@@ -3458,15 +3467,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
/*
* Notify ZED of any significant state-change on a leaf vdev.
*
- * We ignore transitions from a closed state to healthy unless
- * the parent was degraded.
*/
- if (vd->vdev_ops->vdev_op_leaf &&
- ((save_state > VDEV_STATE_CLOSED) ||
- (vd->vdev_state < VDEV_STATE_HEALTHY) ||
- (vd->vdev_parent != NULL &&
- vd->vdev_parent->vdev_prevstate == VDEV_STATE_DEGRADED))) {
- zfs_post_state_change(spa, vd, save_state);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ /* preserve original state from a vdev_reopen() */
+ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+ (vd->vdev_prevstate != vd->vdev_state) &&
+ (save_state <= VDEV_STATE_CLOSED))
+ save_state = vd->vdev_prevstate;
+
+ /* filter out state change due to initial vdev_open */
+ if (save_state > VDEV_STATE_CLOSED)
+ zfs_post_state_change(spa, vd, save_state);
}
if (!isopen && vd->vdev_parent)
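The reworked vdev_set_state() logic above posts a state-change event only
when there is a meaningful prior state: it restores the pre-reopen state
and filters out the transition caused by the initial open. A standalone
sketch of that predicate, where the enum is a simplified stand-in for
vdev_state_t (only UNKNOWN < CLOSED < open states matters to the test):

    /*
     * Standalone sketch of the leaf state-change filter. Illustration only.
     */
    #include <stdio.h>

    enum state { ST_UNKNOWN, ST_CLOSED, ST_FAULTED, ST_DEGRADED, ST_HEALTHY };

    /* Returns 1 if ZED should be notified of this leaf transition. */
    static int
    should_post(enum state prevstate, enum state state, enum state save_state)
    {
    	/* preserve the original state across a vdev_reopen() */
    	if (prevstate != ST_UNKNOWN && prevstate != state &&
    	    save_state <= ST_CLOSED)
    		save_state = prevstate;

    	/* filter out the state change caused by the initial vdev_open() */
    	return (save_state > ST_CLOSED);
    }

    int
    main(void)
    {
    	/* first open, CLOSED -> HEALTHY with no prior state: filtered */
    	printf("%d\n", should_post(ST_UNKNOWN, ST_HEALTHY, ST_CLOSED));
    	/* reopen of a formerly HEALTHY disk that returns FAULTED: posted */
    	printf("%d\n", should_post(ST_HEALTHY, ST_FAULTED, ST_CLOSED));
    	return (0);
    }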
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 0d508c0b8..5b6bea7ae 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -112,6 +112,33 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
fm_nvlist_destroy(detector, FM_NVA_FREE);
}
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+ int rc = 0;
+ /*
+ * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
+ * are. Invert it to get our return value.
+ */
+ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+ } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+ }
+
+ if (rc) {
+ /* We're rate limiting */
+ fm_erpt_dropped_increment();
+ }
+
+ return (rc);
+}
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
@@ -191,6 +218,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
return;
}
+ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+ !zio->io_timestamp) {
+ /* Ignore bogus delay events */
+ return;
+ }
+
/*
* Serialize ereport generation
*/
@@ -738,6 +771,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
if (ereport == NULL)
return;
+ if (zfs_is_ratelimiting_event(subclass, vd))
+ return;
+
/* Cleanup is handled by the callback function */
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
#endif
@@ -748,7 +784,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
{
- zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+ zio_cksum_report_t *report;
+
+#ifdef _KERNEL
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return;
+#endif
+
+ report = kmem_zalloc(sizeof (*report), KM_SLEEP);
if (zio->io_vsd != NULL)
zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);