diff options
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/arc.c | 8 | ||||
-rw-r--r-- | module/zfs/fm.c | 14 | ||||
-rw-r--r-- | module/zfs/spa.c | 4 | ||||
-rw-r--r-- | module/zfs/spa_config.c | 4 | ||||
-rw-r--r-- | module/zfs/vdev.c | 7 | ||||
-rw-r--r-- | module/zfs/vdev_indirect.c | 13 | ||||
-rw-r--r-- | module/zfs/vdev_raidz.c | 35 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 361 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 1 | ||||
-rw-r--r-- | module/zfs/zio.c | 42 |
10 files changed, 418 insertions, 71 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0c33a4535..12837104a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. @@ -2188,7 +2188,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } return (ret); @@ -5654,7 +5654,7 @@ arc_read_done(zio_t *zio) spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } @@ -5931,7 +5931,7 @@ top: spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } } if (rc != 0) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index c00e08b8d..a5003f85d 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -104,13 +104,15 @@ struct erpt_kstat { kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ + kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } + { "payload-set-failed", KSTAT_DATA_UINT64 }, + { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t *fm_ksp; @@ -568,6 +570,12 @@ out: return (error); } +void +zfs_zevent_track_duplicate(void) +{ + atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); +} + static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { @@ -1633,6 +1641,8 @@ fm_init(void) list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); + + zfs_ereport_init(); } void @@ -1640,6 +1650,8 @@ fm_fini(void) { int count; + zfs_ereport_fini(); + zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 31fa52d1d..015996d15 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -2868,7 +2868,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, - NULL, NULL, NULL, 0, 0); + NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index b98b7badb..81059c69d 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ @@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, NULL, 0, 0); + target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 95a2f5947..0a3b8bd83 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -1862,11 +1862,10 @@ vdev_open(vdev_t *vd) vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); - } vd->vdev_max_asize = max_asize; } @@ -4759,7 +4758,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, - save_state, 0); + save_state); } /* Erase any notion of persistent removed state */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 5301e0665..12ee393bd 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -16,7 +16,7 @@ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. - * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -1473,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 4320078b6..47312e02f 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } @@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio) vdev_t *cvd; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; + if (rc->rc_error != 0) + continue; + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum( + zio->io_spa, cvd, &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); - - zfs_ereport_start_checksum( - zio->io_spa, cvd, - &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); } } } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index ad13ccedf..a8341f50b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012,2020 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -101,7 +101,251 @@ * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ + #ifdef _KERNEL +/* + * Duplicate ereport Detection + * + * Some ereports are retained momentarily for detecting duplicates. These + * are kept in a recent_events_node_t in both a time-ordered list and an AVL + * tree of recent unique ereports. + * + * The lifespan of these recent ereports is bounded (15 mins) and a cleaner + * task is used to purge stale entries. + */ +static list_t recent_events_list; +static avl_tree_t recent_events_tree; +static kmutex_t recent_events_lock; +static taskqid_t recent_events_cleaner_tqid; + +/* + * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. + * + * This setting can be changed dynamically and setting it to zero + * disables duplicate detection. + */ +unsigned int zfs_zevent_retain_max = 2000; + +/* + * The lifespan for a recent ereport entry. The default of 15 minutes is + * intended to outlive the zfs diagnosis engine's threshold of 10 errors + * over a period of 10 minutes. + */ +unsigned int zfs_zevent_retain_expire_secs = 900; + +typedef enum zfs_subclass { + ZSC_IO, + ZSC_DATA, + ZSC_CHECKSUM +} zfs_subclass_t; + +typedef struct { + /* common criteria */ + uint64_t re_pool_guid; + uint64_t re_vdev_guid; + int re_io_error; + uint64_t re_io_size; + uint64_t re_io_offset; + zfs_subclass_t re_subclass; + zio_priority_t re_io_priority; + + /* logical zio criteria (optional) */ + zbookmark_phys_t re_io_bookmark; + + /* internal state */ + avl_node_t re_tree_link; + list_node_t re_list_link; + uint64_t re_timestamp; +} recent_events_node_t; + +static int +recent_events_compare(const void *a, const void *b) +{ + const recent_events_node_t *node1 = a; + const recent_events_node_t *node2 = b; + int cmp; + + /* + * The comparison order here is somewhat arbitrary. + * What's important is that if every criteria matches, then it + * is a duplicate (i.e. compare returns 0) + */ + if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) + return (cmp); + + const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; + const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; + + if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) + return (cmp); + + return (0); +} + +static void zfs_ereport_schedule_cleaner(void); + +/* + * background task to clean stale recent event nodes. + */ +/*ARGSUSED*/ +static void +zfs_ereport_cleaner(void *arg) +{ + recent_events_node_t *entry; + uint64_t now = gethrtime(); + + /* + * purge expired entries + */ + mutex_enter(&recent_events_lock); + while ((entry = list_tail(&recent_events_list)) != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + if (age <= zfs_zevent_retain_expire_secs) + break; + + /* remove expired node */ + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + + /* Restart the cleaner if more entries remain */ + recent_events_cleaner_tqid = 0; + if (!list_is_empty(&recent_events_list)) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); +} + +static void +zfs_ereport_schedule_cleaner(void) +{ + ASSERT(MUTEX_HELD(&recent_events_lock)); + + uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); + + recent_events_cleaner_tqid = taskq_dispatch_delay( + system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, + ddi_get_lbolt() + NSEC_TO_TICK(timeout)); +} + +/* + * Check if an ereport would be a duplicate of one recently posted. + * + * An ereport is considered a duplicate if the set of criteria in + * recent_events_node_t all match. + * + * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM + * are candidates for duplicate checking. + */ +static boolean_t +zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) +{ + recent_events_node_t search = {0}, *entry; + + if (vd == NULL || zio == NULL) + return (B_FALSE); + + if (zfs_zevent_retain_max == 0) + return (B_FALSE); + + if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) + search.re_subclass = ZSC_IO; + else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) + search.re_subclass = ZSC_DATA; + else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) + search.re_subclass = ZSC_CHECKSUM; + else + return (B_FALSE); + + search.re_pool_guid = spa_guid(spa); + search.re_vdev_guid = vd->vdev_guid; + search.re_io_error = zio->io_error; + search.re_io_priority = zio->io_priority; + /* if size is supplied use it over what's in zio */ + if (size) { + search.re_io_size = size; + search.re_io_offset = offset; + } else { + search.re_io_size = zio->io_size; + search.re_io_offset = zio->io_offset; + } + + /* grab optional logical zio criteria */ + if (zb != NULL) { + search.re_io_bookmark.zb_objset = zb->zb_objset; + search.re_io_bookmark.zb_object = zb->zb_object; + search.re_io_bookmark.zb_level = zb->zb_level; + search.re_io_bookmark.zb_blkid = zb->zb_blkid; + } + + uint64_t now = gethrtime(); + + mutex_enter(&recent_events_lock); + + /* check if we have seen this one recently */ + entry = avl_find(&recent_events_tree, &search, NULL); + if (entry != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + + /* + * There is still an active cleaner (since we're here). + * Reset the last seen time for this duplicate entry + * so that its lifespand gets extended. + */ + list_remove(&recent_events_list, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + zfs_zevent_track_duplicate(); + mutex_exit(&recent_events_lock); + + return (age <= zfs_zevent_retain_expire_secs); + } + + if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { + /* recycle oldest node */ + entry = list_tail(&recent_events_list); + ASSERT(entry != NULL); + list_remove(&recent_events_list, entry); + avl_remove(&recent_events_tree, entry); + } else { + entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); + } + + /* record this as a recent ereport */ + *entry = search; + avl_add(&recent_events_tree, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + /* Start a cleaner if not already scheduled */ + if (recent_events_cleaner_tqid == 0) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); + return (B_FALSE); +} + void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { @@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) - return (B_FALSE); - if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); @@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) } /* - * Return 0 if event was posted, EINVAL if there was a problem posting it or - * EBUSY if the event was rate limited. + * Post an ereport for the given subclass + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t size) + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (EINVAL); + + if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size)) + zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) @@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, return (rc); } -void +/* + * Prepare a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ +int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) @@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_cksum_report_t *report; #ifdef _KERNEL + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return; + return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); @@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); - return; + return (0); } #endif @@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); + return (0); } void @@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) kmem_free(rpt, sizeof (*rpt)); } - +/* + * Post a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, @@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, nvlist_t *detector = NULL; zfs_ecksum_info_t *info; + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return (EBUSY); + return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { @@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #endif } -#if defined(_KERNEL) +#ifdef _KERNEL +void +zfs_ereport_init(void) +{ + mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&recent_events_list, sizeof (recent_events_node_t), + offsetof(recent_events_node_t, re_list_link)); + avl_create(&recent_events_tree, recent_events_compare, + sizeof (recent_events_node_t), offsetof(recent_events_node_t, + re_tree_link)); +} + +/* + * This 'early' fini needs to run before zfs_fini() which on Linux waits + * for the system_delay_taskq to drain. + */ +void +zfs_ereport_taskq_fini(void) +{ + mutex_enter(&recent_events_lock); + if (recent_events_cleaner_tqid != 0) { + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + recent_events_cleaner_tqid = 0; + } + mutex_exit(&recent_events_lock); +} + +void +zfs_ereport_fini(void) +{ + recent_events_node_t *entry; + + while ((entry = list_head(&recent_events_list)) != NULL) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + avl_destroy(&recent_events_tree); + list_destroy(&recent_events_list); + mutex_destroy(&recent_events_lock); +} + EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, + "Maximum recent zevents records to retain for duplicate checking"); +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, + "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 495ff4707..c9322a826 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7615,6 +7615,7 @@ zfs_kmod_fini(void) kmem_free(zs, sizeof (zfsdev_state_t)); } + zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f956a9ef7..8a8fbccd7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. @@ -547,7 +547,7 @@ error: if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, &zio->io_bookmark, zio, 0, 0); + spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; @@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, - pio->io_spa, vd, zb, pio, 0, 0); + pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { @@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, - NULL, NULL, 0, 0); + NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); @@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_start_checksum(zio->io_spa, + int ret = zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); + + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + } } } @@ -4543,7 +4545,7 @@ zio_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0, 0); + zio, 0); } } } @@ -4557,16 +4559,16 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - if (zio->io_type == ZIO_TYPE_READ) { - zio->io_vd->vdev_stat.vs_read_errors++; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - zio->io_vd->vdev_stat.vs_write_errors++; + int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vd->vdev_stat.vs_read_errors++; + else if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_vd->vdev_stat.vs_write_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } - mutex_exit(&zio->io_vd->vdev_stat_lock); - - (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & @@ -4578,7 +4580,7 @@ zio_done(zio_t *zio) */ spa_log_error(zio->io_spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); + zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } |