author     Tony Hutter <[email protected]>        2018-11-08 16:47:24 -0800
committer  Brian Behlendorf <[email protected]>  2018-11-08 16:47:24 -0800
commit     ad796b8a3b2565bcd9c7460b7bf9154e4850ca93
tree       645cc21be6d49c034f00273276caa82ce59702a7
parent     877d925a9e816337bb62ee61d564118db0181477
Add zpool status -s (slow I/Os) and -p (parsable)
This patch adds a new slow I/Os (-s) column to zpool status to show the
number of slow I/Os per vdev: the number of I/Os that did not complete
within zio_slow_io_ms milliseconds (30 seconds by default). It also adds
a new parsable (-p) flag to display exact values.
    NAME        STATE     READ WRITE CKSUM  SLOW
    testpool    ONLINE       0     0     0     -
      mirror-0  ONLINE       0     0     0     -
        loop0   ONLINE       0     0     0    20
        loop1   ONLINE       0     0     0     0
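The two flags compose: with -p, counters that zpool status would normally
abbreviate are printed exactly. An illustrative zpool status -ps (values
hypothetical) for the same pool after many more slow I/Os:

    NAME        STATE     READ WRITE CKSUM  SLOW
    testpool    ONLINE       0     0     0     -
      mirror-0  ONLINE       0     0     0     -
        loop0   ONLINE       0     0     0  1227
        loop1   ONLINE       0     0     0     0

Without -p the loop0 value would render as roughly 1.2K; with -p it stays 1227.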
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Tony Hutter <[email protected]>
Closes #7756
Closes #6885
-rw-r--r--  cmd/zpool/zpool_main.c                                       53
-rw-r--r--  include/sys/fm/fs/zfs.h                                       1
-rw-r--r--  include/sys/fs/zfs.h                                          4
-rw-r--r--  include/sys/spa.h                                             4
-rw-r--r--  include/sys/zio.h                                             7
-rw-r--r--  man/man5/zfs-events.5                                        12
-rw-r--r--  man/man5/zfs-module-parameters.5                             21
-rw-r--r--  man/man8/zpool.8                                             12
-rw-r--r--  module/zfs/vdev.c                                            23
-rw-r--r--  module/zfs/vdev_label.c                                       3
-rw-r--r--  module/zfs/zfs_fm.c                                         199
-rw-r--r--  module/zfs/zio.c                                             34
-rw-r--r--  tests/runfiles/linux.run                                      2
-rw-r--r--  tests/zfs-tests/tests/functional/fault/Makefile.am            3
-rwxr-xr-x  tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh   77
-rw-r--r--  tests/zfs-tests/tests/perf/perf.shlib                         2
16 files changed, 321 insertions(+), 136 deletions(-)
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 67ec23d47..0b820a39a 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -365,8 +365,8 @@ get_usage(zpool_help_t idx)
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_STATUS:
-		return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]"
-		    "[-T d|u] [pool] ... \n"
+		return (gettext("\tstatus [-c [script1,script2,...]] "
+		    "[-gLpPsvxD] [-T d|u] [pool] ... \n"
 		    "\t    [interval [count]]\n"));
 	case HELP_UPGRADE:
 		return (gettext("\tupgrade\n"
@@ -1669,10 +1669,12 @@ typedef struct status_cbdata {
 	int		cb_namewidth;
 	boolean_t	cb_allpools;
 	boolean_t	cb_verbose;
+	boolean_t	cb_literal;
 	boolean_t	cb_explain;
 	boolean_t	cb_first;
 	boolean_t	cb_dedup_stats;
 	boolean_t	cb_print_status;
+	boolean_t	cb_print_slow_ios;
 	vdev_cmd_data_list_t *vcdl;
 } status_cbdata_t;
 
@@ -1788,10 +1790,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 	    name, state);
 
 	if (!isspare) {
-		zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
-		zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
-		zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
-		(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+		if (cb->cb_literal) {
+			printf(" %5llu %5llu %5llu",
+			    (u_longlong_t)vs->vs_read_errors,
+			    (u_longlong_t)vs->vs_write_errors,
+			    (u_longlong_t)vs->vs_checksum_errors);
+		} else {
+			zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+			zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+			zfs_nicenum(vs->vs_checksum_errors, cbuf,
+			    sizeof (cbuf));
+			printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+		}
+
+		if (cb->cb_print_slow_ios) {
+			if (children == 0) {
+				/* Only leaf vdevs have slow IOs */
+				zfs_nicenum(vs->vs_slow_ios, rbuf,
+				    sizeof (rbuf));
+			} else {
+				snprintf(rbuf, sizeof (rbuf), "-");
+			}
+
+			if (cb->cb_literal)
+				printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+			else
+				printf(" %5s", rbuf);
+		}
+
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
@@ -7175,6 +7201,9 @@ status_callback(zpool_handle_t *zhp, void *data)
 		    cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
 		    "CKSUM");
 
+		if (cbp->cb_print_slow_ios)
+			(void) printf(" %5s", gettext("SLOW"));
+
 		if (cbp->vcdl != NULL)
 			print_cmd_columns(cbp->vcdl, 0);
 
@@ -7241,13 +7270,15 @@ status_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ...
+ * zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ...
  *              [interval [count]]
  *
  *	-c CMD	For each vdev, run command CMD
  *	-g	Display guid for individual vdev name.
  *	-L	Follow links when resolving vdev path name.
+ *	-p	Display values in parsable (exact) format.
  *	-P	Display full path for vdev name.
+ *	-s	Display slow IOs column.
 *	-v	Display complete error logs
 *	-x	Display only pools with potential problems
 *	-D	Display dedup status (undocumented)
@@ -7266,7 +7297,7 @@ zpool_do_status(int argc, char **argv)
 	char *cmd = NULL;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) {
+	while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) {
 		switch (c) {
 		case 'c':
 			if (cmd != NULL) {
@@ -7298,9 +7329,15 @@ zpool_do_status(int argc, char **argv)
 		case 'L':
 			cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
 			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
 		case 'P':
 			cb.cb_name_flags |= VDEV_NAME_PATH;
 			break;
+		case 's':
+			cb.cb_print_slow_ios = B_TRUE;
+			break;
 		case 'v':
 			cb.cb_verbose = B_TRUE;
 			break;
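To make the cb_literal branch above concrete, here is a small userspace
sketch (my own illustration, not ZFS code) of the same decision: print the
raw counter when -p is in effect, otherwise abbreviate it roughly the way
zfs_nicenum() would.

    #include <stdio.h>
    #include <stdint.h>

    /* Render one counter column: exact when "literal" (-p), otherwise
     * abbreviated into a 1024-based unit suffix. */
    static void
    print_count(uint64_t v, int literal)
    {
        const char *units = "KMGTPE";
        double d = (double)v;
        int i = -1;

        if (literal || v < 1024) {
            printf(" %5llu", (unsigned long long)v);
            return;
        }
        while (d >= 1024 && units[i + 1] != '\0') {
            d /= 1024;
            i++;
        }
        printf(" %4.1f%c", d, units[i]);
    }

    int
    main(void)
    {
        print_count(20, 1);     /* "    20", small values look the same */
        print_count(1227, 0);   /* "  1.2K", abbreviated */
        print_count(1227, 1);   /* "  1227", exact, as with -p */
        printf("\n");
        return (0);
    }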
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 513d0cf5e..9bfb123c7 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -74,6 +74,7 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS	"vdev_read_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS	"vdev_delays"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH	"parent_path"
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 4f0e13dfd..05b7685f5 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -665,6 +665,9 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO	"vdev_async_agg_w_histo"
 #define	ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO	"vdev_agg_scrub_histo"
 
+/* Number of slow IOs */
+#define	ZPOOL_CONFIG_VDEV_SLOW_IOS		"vdev_slow_ios"
+
 /* vdev enclosure sysfs path */
 #define	ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH	"vdev_enc_sysfs_path"
 
@@ -990,6 +993,7 @@ typedef struct vdev_stat {
 	uint64_t	vs_fragmentation;	/* device fragmentation */
 	uint64_t	vs_checkpoint_space;	/* checkpoint-consumed space */
 	uint64_t	vs_resilver_deferred;	/* resilver deferred */
+	uint64_t	vs_slow_ios;		/* slow IOs */
 } vdev_stat_t;
 
 /*
diff --git a/include/sys/spa.h b/include/sys/spa.h
index ca657ad70..5dc27e334 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1076,9 +1076,11 @@ extern const char *spa_state_to_name(spa_t *spa);
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
     uint64_t length);
+extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa,
+    vdev_t *vd, zio_t *zio);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
     const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
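The new ZPOOL_CONFIG_VDEV_SLOW_IOS key rides in the vdev's extended stats
nvlist (see the vdev_label.c hunk further down). As a hedged sketch of the
consumer side, assuming `nv` is a vdev config nvlist already obtained through
libzfs, the counter can be pulled out like this:

    #include <libnvpair.h>
    #include <sys/fs/zfs.h>

    /* Read the slow-I/O counter from a vdev config nvlist, or 0 if the
     * extended stats or the key are absent (e.g. older kernel module). */
    static uint64_t
    vdev_slow_ios(nvlist_t *nv)
    {
        nvlist_t *nvx;
        uint64_t slow = 0;

        /* Extended stats live in a nested nvlist under the vdev config. */
        if (nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) == 0)
            (void) nvlist_lookup_uint64(nvx,
                ZPOOL_CONFIG_VDEV_SLOW_IOS, &slow);
        return (slow);
    }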
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 322006649..4b7ad3e22 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -158,11 +158,6 @@ enum zio_encrypt {
 	    (compress) == ZIO_COMPRESS_ON ||	\
 	    (compress) == ZIO_COMPRESS_OFF)
 
-/*
- * Default Linux timeout for a sd device.
- */
-#define	ZIO_DELAY_MAX			(30 * MILLISEC)
-
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
@@ -664,7 +659,7 @@ extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
-extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, const abd_t *good_data, const abd_t *bad_data,
     struct zio_bad_cksum *info);
diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5
index fc2a391f8..7e9bbedaf 100644
--- a/man/man5/zfs-events.5
+++ b/man/man5/zfs-events.5
@@ -96,8 +96,9 @@ information regarding "hung" I/O detection and configuration.
 .ad
 .RS 12n
 Issued when a completed I/O exceeds the maximum allowed time specified
-by the \fBzio_delay_max\fR module option. This can be an indicator of
-problems with the underlying storage device.
+by the \fBzio_slow_io_ms\fR module option. This can be an indicator of
+problems with the underlying storage device. The number of delay events is
+ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter.
 .RE
 
 .sp
@@ -697,10 +698,9 @@ full list of all the I/O stages.
 \fBzio_delay\fR
 .ad
 .RS 12n
-The time in ticks (HZ) required for the block layer to service the I/O. Unlike
-\fBzio_delta\fR this does not include any vdev queuing time and is therefore
-solely a measure of the block layer performance. On most modern Linux systems
-HZ is defined as 1000 making a tick equivalent to 1 millisecond.
+The time elapsed (in nanoseconds) waiting for the block layer to complete the
+I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is
+therefore solely a measure of the block layer performance.
 .RE
 
 .sp
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index d7b53d161..229484afc 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1086,7 +1086,7 @@ Default value: \fB600,000\fR.
 .ad
 .RS 12n
 Interval in milliseconds after which the deadman is triggered and an
-individual IO operation is considered to be "hung". As long as the I/O
+individual I/O operation is considered to be "hung". As long as the I/O
 remains "hung" the deadman will be invoked every
 \fBzfs_deadman_checktime_ms\fR milliseconds until the I/O completes.
 .sp
@@ -1141,10 +1141,10 @@ Default value: \fB500,000\fR.
 .sp
 .ne 2
 .na
-\fBzfs_delays_per_second\fR (int)
+\fBzfs_slow_io_events_per_second\fR (int)
 .ad
 .RS 12n
-Rate limit IO delay events to this many per second.
+Rate limit delay zevents (which report slow I/Os) to this many per second.
 .sp
 Default value: 20
 .RE
@@ -1655,7 +1655,7 @@ Default value: \fB50\fR.
 .ad
 .RS 12n
 We currently support block sizes from 512 bytes to 16MB. The benefits of
-larger blocks, and thus larger IO, need to be weighed against the cost of
+larger blocks, and thus larger I/O, need to be weighed against the cost of
 COWing a giant block to modify one byte. Additionally, very large blocks
 can have an impact on i/o latency, and also potentially on the memory
 allocator. Therefore, we do not allow the recordsize to be set larger than
@@ -2552,12 +2552,13 @@ Default value: \fB0\fR.
 .sp
 .ne 2
 .na
-\fBzio_delay_max\fR (int)
+\fBzio_slow_io_ms\fR (int)
 .ad
 .RS 12n
-A zevent will be logged if a ZIO operation takes more than N milliseconds to
-complete. Note that this is only a logging facility, not a timeout on
-operations.
+When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
+complete, it is marked as a slow I/O. Each slow I/O causes a delay zevent.
+Slow I/O counters can be seen with "zpool status -s".
+
 .sp
 Default value: \fB30,000\fR.
 .RE
@@ -2568,7 +2569,7 @@ Default value: \fB30,000\fR.
 \fBzio_dva_throttle_enabled\fR (int)
 .ad
 .RS 12n
-Throttle block allocations in the ZIO pipeline. This allows for
+Throttle block allocations in the I/O pipeline. This allows for
 dynamic allocation distribution when devices are imbalanced. When
 enabled, the maximum number of pending allocations per top-level vdev
 is limited by \fBzfs_vdev_queue_depth_pct\fR.
@@ -2594,7 +2595,7 @@ Default value: \fB0\fR.
 .ad
 .RS 12n
 Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
-for IO. These workers are responsible for IO work such as compression and
+for I/O. These workers are responsible for I/O work such as compression and
 checksum calculations. Fractional number of CPUs will be rounded down.
 .sp
 The default value of 75 was chosen to avoid using all CPUs which can result in
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index 8fe6b494b..b9e0e1ad4 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -182,7 +182,7 @@
 .Nm
 .Cm status
 .Oo Fl c Ar SCRIPT Oc
-.Op Fl gLPvxD
+.Op Fl DgLpPsvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@@ -2167,7 +2167,7 @@ and automatically import it.
 .Nm
 .Cm status
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
-.Op Fl gLPvxD
+.Op Fl DgLpPsvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@@ -2203,6 +2203,8 @@ Display real paths for vdevs resolving all symbolic links.
 This can be used to look up the current block device name regardless of the
 .Pa /dev/disk/
 path used to open it.
+.It Fl p
+Display numbers in parsable (exact) values.
 .It Fl P
 Display full paths for vdevs instead of only the last component of the path.
 This can be used in conjunction with the
@@ -2214,6 +2216,12 @@ Display a histogram of deduplication statistics, showing the allocated
 and referenced
 .Pq logically referenced in the pool
 block counts and sizes by reference count.
+.It Fl s
+Display the number of leaf vdev slow I/Os. This is the number of I/Os that
+did not complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
+This does not necessarily mean the I/Os failed to complete, only that they
+took an unreasonably long time. This may indicate a problem with the
+underlying storage.
 .It Fl T Sy u Ns | Ns Sy d
 Display a time stamp.
 Specify
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 78e701c80..a99eb93a4 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
 int vdev_dtl_sm_blksz = (1 << 12);
 
 /*
- * Rate limit delay events to this many IO delays per second.
+ * Rate limit slow IO (delay) events to this many per second.
  */
-unsigned int zfs_delays_per_second = 20;
+unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
-unsigned int zfs_checksums_per_second = 20;
+unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver. Allows to work around resilver
@@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
-	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
-	zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
+	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+	    1);
+	zfs_ratelimit_init(&vd->vdev_checksum_rl,
+	    &zfs_checksum_events_per_second, 1);
 
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
@@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
+	vd->vdev_stat.vs_slow_ios = 0;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
@@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
 MODULE_PARM_DESC(vdev_ms_count_limit,
 	"Practical upper limit of total metaslabs per top-level vdev");
 
-module_param(zfs_delays_per_second, uint, 0644);
-MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
-	"IO delays per second");
+module_param(zfs_slow_io_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_slow_io_events_per_second,
+	"Rate limit slow IO (delay) events to this many per second");
 
-module_param(zfs_checksums_per_second, uint, 0644);
-MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
+module_param(zfs_checksum_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
 	"to this many checksum errors per second (do not set below zed "
 	"threshold).");
 
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 7e86e3a8b..b3425cf26 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
 
+	/* IO delays */
+	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+
 	/* Add extended stats nvlist to main nvlist */
 	fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
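The per-vdev limiter initialized above is what keeps a misbehaving disk from
flooding ZED. A rough userspace sketch of the idea (illustration only; the
real zfs_ratelimit_init()/zfs_ratelimit() differ in detail):

    #include <stdio.h>
    #include <time.h>

    struct ratelimit {
        unsigned int *events_per_interval;  /* tunable, may change at runtime */
        unsigned int interval;              /* seconds */
        unsigned int count;                 /* events seen this interval */
        time_t start;                       /* start of current interval */
    };

    /* Return 1 if the event may be posted, 0 if it should be dropped. */
    static int
    ratelimit_allow(struct ratelimit *rl)
    {
        time_t now = time(NULL);

        if (now - rl->start >= rl->interval) {
            /* New interval: reset the budget. */
            rl->start = now;
            rl->count = 0;
        }
        if (rl->count >= *rl->events_per_interval)
            return (0);
        rl->count++;
        return (1);
    }

Because the tunable is referenced by pointer, an administrator changing
zfs_slow_io_events_per_second takes effect on the next event, which mirrors
how the module parameter behaves.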
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index e604f33c8..579aa0380 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
 	return (rc);
 }
 
-static void
+/*
+ * Return B_TRUE if the event actually posted, B_FALSE if not.
+ */
+static boolean_t
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
     const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     zio_t *zio, uint64_t stateoroffset, uint64_t size)
@@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 	uint64_t ena;
 	char class[64];
 
-	/*
-	 * If we are doing a spa_tryimport() or in recovery mode,
-	 * ignore errors.
-	 */
-	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
-	    spa_load_state(spa) == SPA_LOAD_RECOVER)
-		return;
-
-	/*
-	 * If we are in the middle of opening a pool, and the previous attempt
-	 * failed, don't bother logging any new ereports - we're just going to
-	 * get the same diagnosis anyway.
-	 */
-	if (spa_load_state(spa) != SPA_LOAD_NONE &&
-	    spa->spa_last_open_failed)
-		return;
-
-	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error.  This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
-		if (zio->io_type != ZIO_TYPE_READ &&
-		    zio->io_type != ZIO_TYPE_WRITE)
-			return;
-
-		if (vd != NULL) {
-			/*
-			 * If the vdev has already been marked as failing due
-			 * to a failed probe, then ignore any subsequent I/O
-			 * errors, as the DE will automatically fault the vdev
-			 * on the first such failure.  This also catches cases
-			 * where vdev_remove_wanted is set and the device has
-			 * not yet been asynchronously placed into the REMOVED
-			 * state.
-			 */
-			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
-				return;
-
-			/*
-			 * Ignore checksum errors for reads from DTL regions of
-			 * leaf vdevs.
-			 */
-			if (zio->io_type == ZIO_TYPE_READ &&
-			    zio->io_error == ECKSUM &&
-			    vd->vdev_ops->vdev_op_leaf &&
-			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
-				return;
-		}
-	}
-
-	/*
-	 * For probe failure, we want to avoid posting ereports if we've
-	 * already removed the device in the meantime.
-	 */
-	if (vd != NULL &&
-	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
-	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
-		return;
-
-	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
-	    (zio != NULL) && (!zio->io_timestamp)) {
-		/* Ignore bogus delay events */
-		return;
-	}
+	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+		return (B_FALSE);
 
 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
-		return;
+		return (B_FALSE);
 
 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
-		return;
+		return (B_FALSE);
 	}
 
 	/*
@@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
 		    DATA_TYPE_UINT64, vs->vs_write_errors,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
-		    DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
+		    DATA_TYPE_UINT64, vs->vs_checksum_errors,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
+		    DATA_TYPE_UINT64, vs->vs_slow_ios,
+		    NULL);
 	}
 
 	if (pvd != NULL) {
@@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 	/*
 	 * Payload for I/Os with corresponding logical information.
 	 */
-	if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
+	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 		    DATA_TYPE_UINT64, zb->zb_objset,
@@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		    DATA_TYPE_INT64, zb->zb_level,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	*ereport_out = ereport;
 	*detector_out = detector;
+	return (B_TRUE);
 }
 
 /* if it's <= 128 bytes, save the corruption directly */
@@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 }
 #endif
 
-void
+/*
+ * Make sure our event is still valid for the given zio/vdev/pool.  For example,
+ * we don't want to keep logging events for a faulted or missing vdev.
+ */
+boolean_t
+zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
+{
+#ifdef _KERNEL
+	/*
+	 * If we are doing a spa_tryimport() or in recovery mode,
+	 * ignore errors.
+	 */
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+	    spa_load_state(spa) == SPA_LOAD_RECOVER)
+		return (B_FALSE);
+
+	/*
+	 * If we are in the middle of opening a pool, and the previous attempt
+	 * failed, don't bother logging any new ereports - we're just going to
+	 * get the same diagnosis anyway.
+	 */
+	if (spa_load_state(spa) != SPA_LOAD_NONE &&
+	    spa->spa_last_open_failed)
+		return (B_FALSE);
+
+	if (zio != NULL) {
+		/*
+		 * If this is not a read or write zio, ignore the error.  This
+		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+		 */
+		if (zio->io_type != ZIO_TYPE_READ &&
+		    zio->io_type != ZIO_TYPE_WRITE)
+			return (B_FALSE);
+
+		if (vd != NULL) {
+			/*
+			 * If the vdev has already been marked as failing due
+			 * to a failed probe, then ignore any subsequent I/O
+			 * errors, as the DE will automatically fault the vdev
+			 * on the first such failure.  This also catches cases
+			 * where vdev_remove_wanted is set and the device has
+			 * not yet been asynchronously placed into the REMOVED
+			 * state.
+			 */
+			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+				return (B_FALSE);
+
+			/*
+			 * Ignore checksum errors for reads from DTL regions of
+			 * leaf vdevs.
+			 */
+			if (zio->io_type == ZIO_TYPE_READ &&
+			    zio->io_error == ECKSUM &&
+			    vd->vdev_ops->vdev_op_leaf &&
+			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+				return (B_FALSE);
+		}
+	}
+
+	/*
+	 * For probe failure, we want to avoid posting ereports if we've
+	 * already removed the device in the meantime.
+	 */
+	if (vd != NULL &&
+	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+		return (B_FALSE);
+
+	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
+	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+	    (zio != NULL) && (!zio->io_timestamp)) {
+		return (B_FALSE);
+	}
+#endif
+	return (B_TRUE);
+}
+
+/*
+ * Return 0 if event was posted, EINVAL if there was a problem posting it or
+ * EBUSY if the event was rate limited.
+ */
+int
 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
     uint64_t size)
 {
+	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 
 	if (zfs_is_ratelimiting_event(subclass, vd))
-		return;
+		return (SET_ERROR(EBUSY));
 
-	zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
-	    zb, zio, stateoroffset, size);
+	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+	    zb, zio, stateoroffset, size))
+		return (SET_ERROR(EINVAL));	/* couldn't post event */
 
 	if (ereport == NULL)
-		return;
+		return (SET_ERROR(EINVAL));
 
 	/* Cleanup is handled by the callback function */
-	zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #endif
+	return (rc);
 }
 
 void
@@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 {
 	zio_cksum_report_t *report;
 
-
 #ifdef _KERNEL
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return;
@@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 }
 
-void
+int
 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length,
     const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 {
+	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 	zfs_ecksum_info_t *info;
 
-	zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
-	    spa, vd, zb, zio, offset, length);
+	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+		return (EBUSY);
 
-	if (ereport == NULL)
-		return;
+	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
+		return (SET_ERROR(EINVAL));
+	}
 
 	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 	    B_FALSE);
 
 	if (info != NULL) {
-		zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+		rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 		kmem_free(info, sizeof (*info));
 	}
 #endif
+	return (rc);
 }
 
 /*
@@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
 
 #if defined(_KERNEL)
 EXPORT_SYMBOL(zfs_ereport_post);
+EXPORT_SYMBOL(zfs_ereport_is_valid);
 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 EXPORT_SYMBOL(zfs_post_remove);
 EXPORT_SYMBOL(zfs_post_autoreplace);
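The net effect of the zfs_fm.c refactor is a small contract: callers can now
tell a posted event (0) apart from a rate-limited one (EBUSY) and an invalid
or unbuildable one (EINVAL), and they can test validity separately before
touching their own counters. A userspace sketch of that contract (my own
illustration with a stand-in predicate, not the kernel code):

    #include <errno.h>
    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in for zfs_ereport_is_valid(): decide up front whether the
     * event is worth counting at all. */
    static bool event_is_valid(int io_ok) { return io_ok != 0; }

    /* Stand-in for zfs_ereport_post(): 0 = posted, EBUSY = rate limited,
     * EINVAL = event invalid or could not be built. */
    static int
    post_event(int io_ok, int rate_limited)
    {
        if (!event_is_valid(io_ok))
            return (EINVAL);
        if (rate_limited)
            return (EBUSY);
        puts("event posted");
        return (0);
    }

    int
    main(void)
    {
        /* A caller can bump its own counters before posting, so a
         * rate-limited event still leaves the stats correct. */
        int rc = post_event(1, 1);
        printf("rc = %d (%s)\n", rc, rc == EBUSY ? "rate limited" : "ok");
        return (0);
    }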
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index e8c2ca89a..6f1aa640d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
-int zio_delay_max = ZIO_DELAY_MAX;
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
 
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
@@ -4431,10 +4432,28 @@ zio_done(zio_t *zio)
 	 * 30 seconds to complete, post an error described the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
-	if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
-		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
-			    zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+			/*
+			 * We want to only increment our slow IO counters if
+			 * the IO is valid (i.e. not if the drive is removed).
+			 *
+			 * zfs_ereport_post() will also do these checks, but
+			 * it can also ratelimit and have other failures, so we
+			 * need to increment the slow_io counters independent
+			 * of it.
+			 */
+			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+			    zio->io_spa, zio->io_vd, zio)) {
+				mutex_enter(&zio->io_vd->vdev_stat_lock);
+				zio->io_vd->vdev_stat.vs_slow_ios++;
+				mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+				zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
+				    zio, 0, 0);
+			}
+		}
 	}
 
 	if (zio->io_error) {
@@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
-module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+module_param(zio_slow_io_ms, int, 0644);
+MODULE_PARM_DESC(zio_slow_io_ms,
+	"Max I/O completion time (milliseconds) before marking it as slow");
 
 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
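The threshold test in zio_done() above is simple unit bookkeeping: io_delay is
tracked in nanoseconds while the tunable is in milliseconds. A standalone
sketch with hypothetical values (MSEC2NSEC is redefined here purely for
illustration; the kernel provides its own):

    #include <stdint.h>
    #include <stdio.h>

    #define MSEC2NSEC(m)    ((uint64_t)(m) * 1000000ULL)

    int
    main(void)
    {
        int zio_slow_io_ms = 30000;             /* default: 30 s */
        uint64_t io_delay = 31000000000ULL;     /* a hypothetical 31 s I/O */

        if (io_delay >= MSEC2NSEC(zio_slow_io_ms))
            printf("slow I/O: %llu ms >= %d ms threshold\n",
                (unsigned long long)(io_delay / 1000000ULL),
                zio_slow_io_ms);
        return (0);
    }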
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 4f556acde..e52ab9078 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -546,7 +546,7 @@ tags = ['functional', 'exec']
 tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
     'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
     'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
-    'decompress_fault']
+    'decompress_fault', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/async_destroy]
diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am
index ec07db57d..5c68ea26f 100644
--- a/tests/zfs-tests/tests/functional/fault/Makefile.am
+++ b/tests/zfs-tests/tests/functional/fault/Makefile.am
@@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \
 	auto_spare_shared.ksh \
 	decrypt_fault.ksh \
 	decompress_fault.ksh \
-	scrub_after_resilver.ksh
+	scrub_after_resilver.ksh \
+	zpool_status_-s.ksh
 
 dist_pkgdata_DATA = \
 	fault.cfg
diff --git a/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
new file mode 100755
index 000000000..b6a3e71fd
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
@@ -0,0 +1,77 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+#	Verify zpool status -s (slow IOs) works
+#
+# STRATEGY:
+#	1. Create a file
+#	2. Inject slow IOs into the pool
+#	3. Verify we can see the slow IOs with "zpool status -s".
+#	4. Verify we can see delay events.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/zpool_script.shlib
+
+DISK=${DISKS%% *}
+
+verify_runnable "both"
+
+log_must zpool create $TESTPOOL mirror ${DISKS}
+
+function cleanup
+{
+	log_must zinject -c all
+	log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO
+	log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
+	log_must destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+log_must zpool events -c
+
+# Mark any IOs greater than 10ms as slow IOs
+OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
+OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
+log_must set_tunable64 zio_slow_io_ms 10
+log_must set_tunable64 zfs_slow_io_events_per_second 1000
+
+# Create 20ms IOs
+log_must zinject -d $DISK -D20:100 $TESTPOOL
+log_must mkfile 1048576 /$TESTPOOL/testfile
+log_must zpool sync $TESTPOOL
+
+log_must zinject -c all
+SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
+DELAY_EVENTS=$(zpool events | grep delay | wc -l)
+
+if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
+	log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+else
+	log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+fi
diff --git a/tests/zfs-tests/tests/perf/perf.shlib b/tests/zfs-tests/tests/perf/perf.shlib
index 3802c6aec..7165df759 100644
--- a/tests/zfs-tests/tests/perf/perf.shlib
+++ b/tests/zfs-tests/tests/perf/perf.shlib
@@ -410,7 +410,7 @@ function get_system_config
 	    zfs_vdev_async_write_max_active \
 	    zfs_vdev_sync_read_max_active \
 	    zfs_vdev_sync_write_max_active \
-	    zio_delay_max
+	    zio_slow_io_ms
 	do
 		if [ "$tunable" != "zfs_arc_max" ]
 		then
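The same fault-injection recipe works interactively. Illustrative only: the
pool and device names below are placeholders and zinject requires root, but
every command appears in the test above.

    zinject -d loop0 -D20:100 testpool   # inject 20ms I/O delays
    mkfile 1048576 /testpool/testfile    # generate some I/O
    zpool sync testpool
    zinject -c all                       # stop injecting
    zpool status -sp testpool            # exact per-vdev SLOW counts
    zpool events | grep delay            # the matching delay zevents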