summary | refs | log | tree | commit | diff | stats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r-- module/zfs/arc.c 8
-rw-r--r-- module/zfs/fm.c 14
-rw-r--r-- module/zfs/spa.c 4
-rw-r--r-- module/zfs/spa_config.c 4
-rw-r--r-- module/zfs/vdev.c 7
-rw-r--r-- module/zfs/vdev_indirect.c 13
-rw-r--r-- module/zfs/vdev_raidz.c 35
-rw-r--r-- module/zfs/zfs_fm.c 361
-rw-r--r-- module/zfs/zfs_ioctl.c 1
-rw-r--r-- module/zfs/zio.c 42
10 files changed, 418 insertions, 71 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 0c33a4535..12837104a 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020, Delphix. All rights reserved.
* Copyright (c) 2014, Saso Kiselkov. All rights reserved.
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
@@ -2188,7 +2188,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
ret = SET_ERROR(EIO);
spa_log_error(spa, zb);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
return (ret);
@@ -5654,7 +5654,7 @@ arc_read_done(zio_t *zio)
spa_log_error(zio->io_spa, &acb->acb_zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
+ zio->io_spa, NULL, &acb->acb_zb, zio, 0);
}
}
@@ -5931,7 +5931,7 @@ top:
spa_log_error(spa, zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
}
if (rc != 0) {
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index c00e08b8d..a5003f85d 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -104,13 +104,15 @@ struct erpt_kstat {
kstat_named_t erpt_set_failed; /* num erpt set failures */
kstat_named_t fmri_set_failed; /* num fmri set failures */
kstat_named_t payload_set_failed; /* num payload set failures */
+ kstat_named_t erpt_duplicates; /* num duplicate erpts */
};
static struct erpt_kstat erpt_kstat_data = {
{ "erpt-dropped", KSTAT_DATA_UINT64 },
{ "erpt-set-failed", KSTAT_DATA_UINT64 },
{ "fmri-set-failed", KSTAT_DATA_UINT64 },
- { "payload-set-failed", KSTAT_DATA_UINT64 }
+ { "payload-set-failed", KSTAT_DATA_UINT64 },
+ { "erpt-duplicates", KSTAT_DATA_UINT64 }
};
kstat_t *fm_ksp;
@@ -568,6 +570,12 @@ out:
return (error);
}
+void
+zfs_zevent_track_duplicate(void)
+{ /* atomically bump the "erpt-duplicates" counter in erpt_kstat_data */
+ atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
+}
+
static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{
@@ -1633,6 +1641,8 @@ fm_init(void)
list_create(&zevent_list, sizeof (zevent_t),
offsetof(zevent_t, ev_node));
cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
+
+ zfs_ereport_init();
}
void
@@ -1640,6 +1650,8 @@ fm_fini(void)
{
int count;
+ zfs_ereport_fini();
+
zfs_zevent_drain_all(&count);
mutex_enter(&zevent_lock);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 31fa52d1d..015996d15 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -2868,7 +2868,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
}
if (error != EBADF) {
(void) zfs_ereport_post(ereport, spa,
- NULL, NULL, NULL, 0, 0);
+ NULL, NULL, NULL, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index b98b7badb..81059c69d 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
@@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (target->spa_ccw_fail_time == 0) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
- target, NULL, NULL, NULL, 0, 0);
+ target, NULL, NULL, NULL, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 95a2f5947..0a3b8bd83 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio)
ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe");
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1862,11 +1862,10 @@ vdev_open(vdev_t *vd)
vd->vdev_ops->vdev_op_leaf) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (SET_ERROR(EDOM));
-
}
vd->vdev_max_asize = max_asize;
}
@@ -4759,7 +4758,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
}
(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
- save_state, 0);
+ save_state);
}
/* Erase any notion of persistent removed state */
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 5301e0665..12ee393bd 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -16,7 +16,7 @@
/*
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
- * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -1473,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vdev_t *vd = ic->ic_vdev;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
NULL, zio, is->is_target_offset, is->is_size,
NULL, NULL, NULL);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 4320078b6..47312e02f 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
*/
@@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
@@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio)
vdev_t *cvd;
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
+ if (rc->rc_error != 0)
+ continue;
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(
+ zio->io_spa, cvd, &zio->io_bookmark, zio,
+ rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(
- zio->io_spa, cvd,
- &zio->io_bookmark, zio,
- rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
}
}
}
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index ad13ccedf..a8341f50b 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012,2020 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -101,7 +101,251 @@
* good and bad versions of the buffer (if available), and we annotate the
* ereport with information about the differences.
*/
+
#ifdef _KERNEL
+/*
+ * Duplicate ereport Detection
+ *
+ * Some ereports are retained momentarily for detecting duplicates. These
+ * are kept in a recent_events_node_t in both a time-ordered list and an AVL
+ * tree of recent unique ereports.
+ *
+ * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
+ * task is used to purge stale entries.
+ */
+static list_t recent_events_list; /* nodes, most recently seen at the head */
+static avl_tree_t recent_events_tree; /* same nodes, via recent_events_compare */
+static kmutex_t recent_events_lock; /* held around list, tree and tqid updates */
+static taskqid_t recent_events_cleaner_tqid; /* pending cleaner id, 0 if none */
+
+/*
+ * Maximum number of recent ereports retained for duplicate detection.
+ * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
+ * This setting can be changed dynamically and setting it to zero
+ * disables duplicate detection.
+ */
+unsigned int zfs_zevent_retain_max = 2000;
+
+/*
+ * The lifespan, in seconds, for a recent ereport entry. The default of 15
+ * minutes is intended to outlive the zfs diagnosis engine's threshold of
+ * 10 errors over a period of 10 minutes.
+ */
+unsigned int zfs_zevent_retain_expire_secs = 900;
+
+typedef enum zfs_subclass { /* ereport classes subject to duplicate checking */
+ ZSC_IO, /* FM_EREPORT_ZFS_IO */
+ ZSC_DATA, /* FM_EREPORT_ZFS_DATA */
+ ZSC_CHECKSUM /* FM_EREPORT_ZFS_CHECKSUM */
+} zfs_subclass_t;
+
+typedef struct { /* one recently seen ereport; the criteria below form its key */
+ /* common criteria */
+ uint64_t re_pool_guid; /* spa_guid() of the pool */
+ uint64_t re_vdev_guid; /* vdev_guid of the vdev */
+ int re_io_error; /* zio->io_error */
+ uint64_t re_io_size; /* size argument if non-zero, else zio->io_size */
+ uint64_t re_io_offset; /* offset argument, else zio->io_offset */
+ zfs_subclass_t re_subclass; /* which ereport class was reported */
+ zio_priority_t re_io_priority; /* zio->io_priority */
+
+ /* logical zio criteria (optional) */
+ zbookmark_phys_t re_io_bookmark; /* left zeroed when no bookmark is given */
+
+ /* internal state */
+ avl_node_t re_tree_link; /* linkage in recent_events_tree */
+ list_node_t re_list_link; /* linkage in recent_events_list */
+ uint64_t re_timestamp; /* gethrtime() when last seen */
+} recent_events_node_t;
+
+static int
+recent_events_compare(const void *a, const void *b)
+{
+ const recent_events_node_t *node1 = a;
+ const recent_events_node_t *node2 = b;
+ int cmp;
+
+ /*
+ * AVL comparator for recent_events_tree. The comparison order is
+ * somewhat arbitrary; what's important is that the result is 0
+ * (i.e. a duplicate) only when every criterion matches.
+ */
+ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
+ return (cmp);
+
+ const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
+ const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
+
+ if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
+ return (cmp);
+
+ return (0); /* timestamp and linkage fields are deliberately not compared */
+}
+
+static void zfs_ereport_schedule_cleaner(void);
+
+/*
+ * Background task that frees recent event nodes older than the expiry time.
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_cleaner(void *arg)
+{
+ recent_events_node_t *entry;
+ uint64_t now = gethrtime();
+
+ /*
+ * Purge expired entries from the tail (new entries go on the head).
+ */
+ mutex_enter(&recent_events_lock);
+ while ((entry = list_tail(&recent_events_list)) != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ if (age <= zfs_zevent_retain_expire_secs)
+ break; /* stop at the first entry that has not expired */
+
+ /* remove expired node */
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+
+ /* Restart the cleaner if more entries remain */
+ recent_events_cleaner_tqid = 0; /* 0 means no cleaner is scheduled */
+ if (!list_is_empty(&recent_events_list))
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+}
+
+static void
+zfs_ereport_schedule_cleaner(void)
+{ /* dispatch zfs_ereport_cleaner() to run after the expiry period plus 1s */
+ ASSERT(MUTEX_HELD(&recent_events_lock)); /* caller must hold the lock */
+
+ uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
+
+ recent_events_cleaner_tqid = taskq_dispatch_delay(
+ system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
+ ddi_get_lbolt() + NSEC_TO_TICK(timeout));
+}
+
+/*
+ * Check whether an ereport duplicates one recently posted, i.e. whether all
+ * criteria in recent_events_node_t match an entry no older than
+ * zfs_zevent_retain_expire_secs. Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA
+ * and FM_EREPORT_ZFS_CHECKSUM are candidates. Returns B_TRUE for a duplicate;
+ * B_FALSE otherwise (vd or zio NULL, zfs_zevent_retain_max == 0, another
+ * subclass, first sighting, or stale entry). A first sighting is recorded
+ * and a stale entry is refreshed; size == 0 uses the zio's offset and size.
+ */
+static boolean_t
+zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
+{
+ recent_events_node_t search = {0}, *entry;
+
+ if (vd == NULL || zio == NULL)
+ return (B_FALSE);
+
+ if (zfs_zevent_retain_max == 0)
+ return (B_FALSE);
+
+ if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
+ search.re_subclass = ZSC_IO;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
+ search.re_subclass = ZSC_DATA;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
+ search.re_subclass = ZSC_CHECKSUM;
+ else
+ return (B_FALSE);
+
+ search.re_pool_guid = spa_guid(spa);
+ search.re_vdev_guid = vd->vdev_guid;
+ search.re_io_error = zio->io_error;
+ search.re_io_priority = zio->io_priority;
+ /* if size is supplied use it over what's in zio */
+ if (size) {
+ search.re_io_size = size;
+ search.re_io_offset = offset;
+ } else {
+ search.re_io_size = zio->io_size;
+ search.re_io_offset = zio->io_offset;
+ }
+
+ /* grab optional logical zio criteria */
+ if (zb != NULL) {
+ search.re_io_bookmark.zb_objset = zb->zb_objset;
+ search.re_io_bookmark.zb_object = zb->zb_object;
+ search.re_io_bookmark.zb_level = zb->zb_level;
+ search.re_io_bookmark.zb_blkid = zb->zb_blkid;
+ }
+
+ uint64_t now = gethrtime();
+
+ mutex_enter(&recent_events_lock);
+
+ /* check if we have seen this one recently */
+ entry = avl_find(&recent_events_tree, &search, NULL);
+ if (entry != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ boolean_t dup = (age <= zfs_zevent_retain_expire_secs);
+ /*
+ * There is still an active cleaner (since we're here).
+ * Reset the last seen time for this entry so that its
+ * lifespan gets extended.
+ */
+ list_remove(&recent_events_list, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ if (dup) /* a stale match still gets posted, so don't count it */
+ zfs_zevent_track_duplicate();
+ mutex_exit(&recent_events_lock);
+ return (dup);
+ }
+
+ if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
+ /* recycle oldest node */
+ entry = list_tail(&recent_events_list);
+ ASSERT(entry != NULL);
+ list_remove(&recent_events_list, entry);
+ avl_remove(&recent_events_tree, entry);
+ } else {
+ entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
+ }
+
+ /* record this as a recent ereport */
+ *entry = search;
+ avl_add(&recent_events_tree, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ /* Start a cleaner if not already scheduled */
+ if (recent_events_cleaner_tqid == 0)
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+ return (B_FALSE);
+}
+
void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{
@@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena;
char class[64];
- if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
- return (B_FALSE);
-
if ((ereport = fm_nvlist_create(NULL)) == NULL)
return (B_FALSE);
@@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+ DATA_TYPE_UINT32, zio->io_priority, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
@@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
}
/*
- * Return 0 if event was posted, EINVAL if there was a problem posting it or
- * EBUSY if the event was rate limited.
+ * Post an ereport for the given subclass
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
*/
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
- const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
- uint64_t size)
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
{
int rc = 0;
#ifdef _KERNEL
nvlist_t *ereport = NULL;
nvlist_t *detector = NULL;
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (EINVAL);
+
+ if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(subclass, vd))
return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
- zb, zio, stateoroffset, size))
+ zb, zio, state, 0))
return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL)
@@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
return (rc);
}
-void
+/*
+ * Prepare a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
@@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_cksum_report_t *report;
#ifdef _KERNEL
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return;
+ return (SET_ERROR(EBUSY));
#endif
report = kmem_zalloc(sizeof (*report), KM_SLEEP);
@@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report);
- return;
+ return (0);
}
#endif
@@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report->zcr_next = zio->io_logical->io_cksum_report;
zio->io_logical->io_cksum_report = report;
mutex_exit(&spa->spa_errlist_lock);
+ return (0);
}
void
@@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
kmem_free(rpt, sizeof (*rpt));
}
-
+/*
+ * Post a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
@@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
@@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
#endif
}
-#if defined(_KERNEL)
+#ifdef _KERNEL
+void
+zfs_ereport_init(void)
+{ /* set up the lock, list and AVL tree used for duplicate detection */
+ mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&recent_events_list, sizeof (recent_events_node_t),
+ offsetof(recent_events_node_t, re_list_link));
+ avl_create(&recent_events_tree, recent_events_compare,
+ sizeof (recent_events_node_t), offsetof(recent_events_node_t,
+ re_tree_link));
+}
+
+/*
+ * Cancel any pending cleaner task. This 'early' fini needs to run before
+ * zfs_fini() which on Linux waits for the system_delay_taskq to drain.
+ */
+void
+zfs_ereport_taskq_fini(void)
+{
+ mutex_enter(&recent_events_lock);
+ if (recent_events_cleaner_tqid != 0) { /* non-zero: a cleaner is scheduled */
+ /* NOTE(review): cleaner takes this lock; confirm cancel can't block on it */
+ taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+ recent_events_cleaner_tqid = 0;
+ }
+ mutex_exit(&recent_events_lock);
+}
+
+void
+zfs_ereport_fini(void)
+{ /* free every retained node, then destroy the tree, list and lock */
+ recent_events_node_t *entry;
+
+ while ((entry = list_head(&recent_events_list)) != NULL) {
+ avl_remove(&recent_events_tree, entry); /* unlink from both before free */
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+ avl_destroy(&recent_events_tree);
+ list_destroy(&recent_events_list);
+ mutex_destroy(&recent_events_lock);
+}
+
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
+ "Maximum recent zevents records to retain for duplicate checking");
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
+ "Expiration time for recent zevents records");
#endif /* _KERNEL */
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 495ff4707..c9322a826 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -7615,6 +7615,7 @@ zfs_kmod_fini(void)
kmem_free(zs, sizeof (zfsdev_state_t));
}
+ zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
zfs_fini();
spa_fini();
zvol_fini();
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index f956a9ef7..8a8fbccd7 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Klara Inc.
@@ -547,7 +547,7 @@ error:
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ spa, NULL, &zio->io_bookmark, zio, 0);
}
} else {
zio->io_error = ret;
@@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
- pio->io_spa, vd, zb, pio, 0, 0);
+ pio->io_spa, vd, zb, pio, 0);
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
taskq_empty_ent(&pio->io_tqent)) {
@@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and has been suspended.\n", spa_name(spa));
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
- NULL, NULL, 0, 0);
+ NULL, NULL, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio)
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- zio->io_vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(zio->io_spa,
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
+
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
}
}
@@ -4543,7 +4545,7 @@ zio_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark,
- zio, 0, 0);
+ zio, 0);
}
}
}
@@ -4557,16 +4559,16 @@ zio_done(zio_t *zio)
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- if (zio->io_type == ZIO_TYPE_READ) {
- zio->io_vd->vdev_stat.vs_read_errors++;
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- zio->io_vd->vdev_stat.vs_write_errors++;
+ int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
}
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
- zio->io_vd, &zio->io_bookmark, zio, 0, 0);
}
if ((zio->io_error == EIO || !(zio->io_flags &
@@ -4578,7 +4580,7 @@ zio_done(zio_t *zio)
*/
spa_log_error(zio->io_spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
}