diff options
author | Don Brady <[email protected]> | 2020-09-04 11:34:28 -0600 |
---|---|---|
committer | GitHub <[email protected]> | 2020-09-04 10:34:28 -0700 |
commit | 4f0728278615eb42fc5022b2817c082f578e225f (patch) | |
tree | 598cd2bb948dd3b0eb0469139a33269ae58fc40e /module/zfs | |
parent | 3808032489f28c1f36b39c9a3274d5f4b6f9638a (diff) |
Avoid posting duplicate zpool events
Duplicate io and checksum ereport events can make a pool's condition
appear worse than it really is. Ideally the zpool events and the
corresponding vdev stat error counts in a zpool status should be
for unique errors -- not the same error being counted over and over.
This can be demonstrated in a simple example. With a single bad
block in a datafile and just 5 reads of the file we end up with a
degraded vdev, even though there is only one unique error in the pool.
The proposed solution to the above issue is to eliminate duplicates
when posting events and when updating vdev error stats. We now save
recent error events of interest when posting events so that we can
easily check for duplicates when posting an error.
Reviewed-by: Brad Lewis <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #10861
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/arc.c | 8 | ||||
-rw-r--r-- | module/zfs/fm.c | 14 | ||||
-rw-r--r-- | module/zfs/spa.c | 4 | ||||
-rw-r--r-- | module/zfs/spa_config.c | 4 | ||||
-rw-r--r-- | module/zfs/vdev.c | 7 | ||||
-rw-r--r-- | module/zfs/vdev_indirect.c | 13 | ||||
-rw-r--r-- | module/zfs/vdev_raidz.c | 35 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 361 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 1 | ||||
-rw-r--r-- | module/zfs/zio.c | 42 |
10 files changed, 418 insertions, 71 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0c33a4535..12837104a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. @@ -2188,7 +2188,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } return (ret); @@ -5654,7 +5654,7 @@ arc_read_done(zio_t *zio) spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } @@ -5931,7 +5931,7 @@ top: spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } } if (rc != 0) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index c00e08b8d..a5003f85d 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -104,13 +104,15 @@ struct erpt_kstat { kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ + kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } + { "payload-set-failed", KSTAT_DATA_UINT64 }, + { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t 
*fm_ksp; @@ -568,6 +570,12 @@ out: return (error); } +void +zfs_zevent_track_duplicate(void) +{ + atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); +} + static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { @@ -1633,6 +1641,8 @@ fm_init(void) list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); + + zfs_ereport_init(); } void @@ -1640,6 +1650,8 @@ fm_fini(void) { int count; + zfs_ereport_fini(); + zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 31fa52d1d..015996d15 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -2868,7 +2868,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, - NULL, NULL, NULL, 0, 0); + NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index b98b7badb..81059c69d 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. 
*/ @@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, NULL, 0, 0); + target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 95a2f5947..0a3b8bd83 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -1862,11 +1862,10 @@ vdev_open(vdev_t *vd) vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); - } vd->vdev_max_asize = max_asize; } @@ -4759,7 +4758,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, - save_state, 0); + save_state); } /* Erase any notion of persistent removed state */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 5301e0665..12ee393bd 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -16,7 +16,7 @@ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. - * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -1473,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 4320078b6..47312e02f 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } @@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio) vdev_t *cvd; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; + if (rc->rc_error != 0) + continue; + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected 
= rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum( + zio->io_spa, cvd, &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); - - zfs_ereport_start_checksum( - zio->io_spa, cvd, - &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); } } } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index ad13ccedf..a8341f50b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012,2020 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -101,7 +101,251 @@ * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ + #ifdef _KERNEL +/* + * Duplicate ereport Detection + * + * Some ereports are retained momentarily for detecting duplicates. These + * are kept in a recent_events_node_t in both a time-ordered list and an AVL + * tree of recent unique ereports. + * + * The lifespan of these recent ereports is bounded (15 mins) and a cleaner + * task is used to purge stale entries. + */ +static list_t recent_events_list; +static avl_tree_t recent_events_tree; +static kmutex_t recent_events_lock; +static taskqid_t recent_events_cleaner_tqid; + +/* + * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. + * + * This setting can be changed dynamically and setting it to zero + * disables duplicate detection. + */ +unsigned int zfs_zevent_retain_max = 2000; + +/* + * The lifespan for a recent ereport entry. The default of 15 minutes is + * intended to outlive the zfs diagnosis engine's threshold of 10 errors + * over a period of 10 minutes. 
+ */ +unsigned int zfs_zevent_retain_expire_secs = 900; + +typedef enum zfs_subclass { + ZSC_IO, + ZSC_DATA, + ZSC_CHECKSUM +} zfs_subclass_t; + +typedef struct { + /* common criteria */ + uint64_t re_pool_guid; + uint64_t re_vdev_guid; + int re_io_error; + uint64_t re_io_size; + uint64_t re_io_offset; + zfs_subclass_t re_subclass; + zio_priority_t re_io_priority; + + /* logical zio criteria (optional) */ + zbookmark_phys_t re_io_bookmark; + + /* internal state */ + avl_node_t re_tree_link; + list_node_t re_list_link; + uint64_t re_timestamp; +} recent_events_node_t; + +static int +recent_events_compare(const void *a, const void *b) +{ + const recent_events_node_t *node1 = a; + const recent_events_node_t *node2 = b; + int cmp; + + /* + * The comparison order here is somewhat arbitrary. + * What's important is that if every criteria matches, then it + * is a duplicate (i.e. compare returns 0) + */ + if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) + return (cmp); + + const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; + const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; + + if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) + return (cmp); + + return (0); +} + +static void 
zfs_ereport_schedule_cleaner(void); + +/* + * background task to clean stale recent event nodes. + */ +/*ARGSUSED*/ +static void +zfs_ereport_cleaner(void *arg) +{ + recent_events_node_t *entry; + uint64_t now = gethrtime(); + + /* + * purge expired entries + */ + mutex_enter(&recent_events_lock); + while ((entry = list_tail(&recent_events_list)) != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + if (age <= zfs_zevent_retain_expire_secs) + break; + + /* remove expired node */ + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + + /* Restart the cleaner if more entries remain */ + recent_events_cleaner_tqid = 0; + if (!list_is_empty(&recent_events_list)) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); +} + +static void +zfs_ereport_schedule_cleaner(void) +{ + ASSERT(MUTEX_HELD(&recent_events_lock)); + + uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); + + recent_events_cleaner_tqid = taskq_dispatch_delay( + system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, + ddi_get_lbolt() + NSEC_TO_TICK(timeout)); +} + +/* + * Check if an ereport would be a duplicate of one recently posted. + * + * An ereport is considered a duplicate if the set of criteria in + * recent_events_node_t all match. + * + * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM + * are candidates for duplicate checking. 
+ */ +static boolean_t +zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) +{ + recent_events_node_t search = {0}, *entry; + + if (vd == NULL || zio == NULL) + return (B_FALSE); + + if (zfs_zevent_retain_max == 0) + return (B_FALSE); + + if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) + search.re_subclass = ZSC_IO; + else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) + search.re_subclass = ZSC_DATA; + else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) + search.re_subclass = ZSC_CHECKSUM; + else + return (B_FALSE); + + search.re_pool_guid = spa_guid(spa); + search.re_vdev_guid = vd->vdev_guid; + search.re_io_error = zio->io_error; + search.re_io_priority = zio->io_priority; + /* if size is supplied use it over what's in zio */ + if (size) { + search.re_io_size = size; + search.re_io_offset = offset; + } else { + search.re_io_size = zio->io_size; + search.re_io_offset = zio->io_offset; + } + + /* grab optional logical zio criteria */ + if (zb != NULL) { + search.re_io_bookmark.zb_objset = zb->zb_objset; + search.re_io_bookmark.zb_object = zb->zb_object; + search.re_io_bookmark.zb_level = zb->zb_level; + search.re_io_bookmark.zb_blkid = zb->zb_blkid; + } + + uint64_t now = gethrtime(); + + mutex_enter(&recent_events_lock); + + /* check if we have seen this one recently */ + entry = avl_find(&recent_events_tree, &search, NULL); + if (entry != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + + /* + * There is still an active cleaner (since we're here). + * Reset the last seen time for this duplicate entry + * so that its lifespand gets extended. 
+ */ + list_remove(&recent_events_list, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + zfs_zevent_track_duplicate(); + mutex_exit(&recent_events_lock); + + return (age <= zfs_zevent_retain_expire_secs); + } + + if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { + /* recycle oldest node */ + entry = list_tail(&recent_events_list); + ASSERT(entry != NULL); + list_remove(&recent_events_list, entry); + avl_remove(&recent_events_tree, entry); + } else { + entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); + } + + /* record this as a recent ereport */ + *entry = search; + avl_add(&recent_events_tree, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + /* Start a cleaner if not already scheduled */ + if (recent_events_cleaner_tqid == 0) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); + return (B_FALSE); +} + void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { @@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) - return (B_FALSE); - if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); @@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) } /* - * Return 0 if event was posted, EINVAL if there was a problem posting it or - * EBUSY if the event was rate limited. 
+ * Post an ereport for the given subclass + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t size) + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (EINVAL); + + if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size)) + zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) @@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, return (rc); } -void +/* + * Prepare a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ +int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) @@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_cksum_report_t *report; #ifdef _KERNEL + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return; + return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof 
(*report), KM_SLEEP); @@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); - return; + return (0); } #endif @@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); + return (0); } void @@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) kmem_free(rpt, sizeof (*rpt)); } - +/* + * Post a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, @@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, nvlist_t *detector = NULL; zfs_ecksum_info_t *info; + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return (EBUSY); + return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { @@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #endif } -#if defined(_KERNEL) +#ifdef _KERNEL +void +zfs_ereport_init(void) +{ + mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&recent_events_list, sizeof (recent_events_node_t), + offsetof(recent_events_node_t, re_list_link)); + avl_create(&recent_events_tree, recent_events_compare, + sizeof 
(recent_events_node_t), offsetof(recent_events_node_t, + re_tree_link)); +} + +/* + * This 'early' fini needs to run before zfs_fini() which on Linux waits + * for the system_delay_taskq to drain. + */ +void +zfs_ereport_taskq_fini(void) +{ + mutex_enter(&recent_events_lock); + if (recent_events_cleaner_tqid != 0) { + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + recent_events_cleaner_tqid = 0; + } + mutex_exit(&recent_events_lock); +} + +void +zfs_ereport_fini(void) +{ + recent_events_node_t *entry; + + while ((entry = list_head(&recent_events_list)) != NULL) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + avl_destroy(&recent_events_tree); + list_destroy(&recent_events_list); + mutex_destroy(&recent_events_lock); +} + EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, + "Maximum recent zevents records to retain for duplicate checking"); +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, + "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 495ff4707..c9322a826 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7615,6 +7615,7 @@ zfs_kmod_fini(void) kmem_free(zs, sizeof (zfsdev_state_t)); } + zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f956a9ef7..8a8fbccd7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. @@ -547,7 +547,7 @@ error: if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, &zio->io_bookmark, zio, 0, 0); + spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; @@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, - pio->io_spa, vd, zb, pio, 0, 0); + pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { @@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, - NULL, NULL, 0, 0); + NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); @@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_start_checksum(zio->io_spa, + int ret = zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); + + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + } } } @@ -4543,7 +4545,7 @@ zio_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0, 0); + zio, 0); } } } @@ -4557,16 +4559,16 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && 
zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - if (zio->io_type == ZIO_TYPE_READ) { - zio->io_vd->vdev_stat.vs_read_errors++; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - zio->io_vd->vdev_stat.vs_write_errors++; + int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vd->vdev_stat.vs_read_errors++; + else if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_vd->vdev_stat.vs_write_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } - mutex_exit(&zio->io_vd->vdev_stat_lock); - - (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & @@ -4578,7 +4580,7 @@ zio_done(zio_t *zio) */ spa_log_error(zio->io_spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); + zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } |