aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/ztest/ztest.c78
-rw-r--r--configure.ac1
-rw-r--r--include/sys/fm/fs/zfs.h1
-rw-r--r--include/sys/spa.h9
-rw-r--r--include/sys/spa_impl.h4
-rw-r--r--include/sys/vdev.h2
-rw-r--r--include/sys/zfs_context.h1
-rw-r--r--include/sys/zio.h1
-rw-r--r--man/man1/ztest.14
-rw-r--r--man/man5/zfs-events.519
-rw-r--r--man/man5/zfs-module-parameters.564
-rw-r--r--module/zfs/spa_misc.c94
-rw-r--r--module/zfs/vdev.c19
-rw-r--r--module/zfs/zio.c90
-rwxr-xr-xscripts/zloop.sh9
-rw-r--r--tests/runfiles/linux.run6
-rw-r--r--tests/zfs-tests/tests/functional/Makefile.am1
-rw-r--r--tests/zfs-tests/tests/functional/deadman/Makefile.am5
-rw-r--r--tests/zfs-tests/tests/functional/deadman/deadman.cfg32
-rwxr-xr-xtests/zfs-tests/tests/functional/deadman/deadman_sync.ksh86
-rwxr-xr-xtests/zfs-tests/tests/functional/deadman/deadman_zio.ksh113
21 files changed, 582 insertions, 57 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index f07e11c83..a8b5418b6 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -197,6 +197,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
+extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
@@ -447,6 +448,7 @@ static kmutex_t ztest_vdev_lock;
static rwlock_t ztest_name_lock;
static boolean_t ztest_dump_core = B_TRUE;
+static boolean_t ztest_dump_debug_buffer = B_FALSE;
static boolean_t ztest_exiting;
/* Global commit callback list */
@@ -495,6 +497,16 @@ _umem_logging_init(void)
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
+static void
+dump_debug_buffer(void)
+{
+ if (!ztest_dump_debug_buffer)
+ return;
+
+ (void) printf("\n");
+ zfs_dbgmsg_print("ztest");
+}
+
#define BACKTRACE_SZ 100
static void sig_handler(int signo)
@@ -507,6 +519,7 @@ static void sig_handler(int signo)
nptrs = backtrace(buffer, BACKTRACE_SZ);
backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
+ dump_debug_buffer();
/*
* Restore default action and re-raise signal so SIGSEGV and
@@ -544,6 +557,9 @@ fatal(int do_perror, char *message, ...)
}
(void) fprintf(stderr, "%s\n", buf);
fatal_msg = buf; /* to ease debugging */
+
+ dump_debug_buffer();
+
if (ztest_dump_core)
abort();
exit(3);
@@ -641,6 +657,7 @@ usage(boolean_t requested)
"\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
"\t[-o variable=value] ... set global variable to an unsigned\n"
"\t 32-bit integer value\n"
+ "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n"
"\t[-h] (print help)\n"
"",
zo->zo_pool,
@@ -676,7 +693,7 @@ process_options(int argc, char **argv)
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
- "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) {
+ "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) {
value = 0;
switch (opt) {
case 'v':
@@ -771,6 +788,9 @@ process_options(int argc, char **argv)
if (set_global_var(optarg) != 0)
usage(B_FALSE);
break;
+ case 'G':
+ ztest_dump_debug_buffer = B_TRUE;
+ break;
case 'h':
usage(B_TRUE);
break;
@@ -6224,15 +6244,48 @@ ztest_resume_thread(void *arg)
thread_exit();
}
-#define GRACE 300
-
-#if 0
static void
-ztest_deadman_alarm(int sig)
+ztest_deadman_thread(void *arg)
{
- fatal(0, "failed to complete within %d seconds of deadline", GRACE);
+ ztest_shared_t *zs = arg;
+ spa_t *spa = ztest_spa;
+ hrtime_t delta, overdue, total = 0;
+
+ for (;;) {
+ delta = zs->zs_thread_stop - zs->zs_thread_start +
+ MSEC2NSEC(zfs_deadman_synctime_ms);
+
+ (void) poll(NULL, 0, (int)NSEC2MSEC(delta));
+
+ /*
+ * If the pool is suspended then fail immediately. Otherwise,
+ * check to see if the pool is making any progress. If
+ * vdev_deadman() discovers that there hasn't been any recent
+ * I/Os then it will end up aborting the tests.
+ */
+ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
+ fatal(0, "aborting test after %llu seconds because "
+ "pool has transitioned to a suspended state.",
+ zfs_deadman_synctime_ms / 1000);
+ }
+ vdev_deadman(spa->spa_root_vdev, FTAG);
+
+ /*
+ * If the process doesn't complete within a grace period of
+ * zfs_deadman_synctime_ms over the expected finish time,
+ * then it may be hung and is terminated.
+ */
+ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
+ total += zfs_deadman_synctime_ms / 1000;
+ if (gethrtime() > overdue) {
+ fatal(0, "aborting test after %llu seconds because "
+ "the process is overdue for termination.", total);
+ }
+
+ (void) printf("ztest has been running for %lld seconds\n",
+ total);
+ }
}
-#endif
static void
ztest_execute(int test, ztest_info_t *zi, uint64_t id)
@@ -6491,13 +6544,13 @@ ztest_run(ztest_shared_t *zs)
resume_thread = thread_create(NULL, 0, ztest_resume_thread,
spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
-#if 0
/*
- * Set a deadman alarm to abort() if we hang.
+ * Create a deadman thread and set to panic if we hang.
*/
- signal(SIGALRM, ztest_deadman_alarm);
- alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);
-#endif
+ (void) thread_create(NULL, 0, ztest_deadman_thread,
+ zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
/*
* Verify that we can safely inquire about about any object,
@@ -7047,6 +7100,7 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
+ zfs_deadman_synctime_ms = 300000;
action.sa_handler = sig_handler;
sigemptyset(&action.sa_mask);
diff --git a/configure.ac b/configure.ac
index 47e0a1b12..fe7093e23 100644
--- a/configure.ac
+++ b/configure.ac
@@ -242,6 +242,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
tests/zfs-tests/tests/functional/compression/Makefile
tests/zfs-tests/tests/functional/ctime/Makefile
+ tests/zfs-tests/tests/functional/deadman/Makefile
tests/zfs-tests/tests/functional/delegate/Makefile
tests/zfs-tests/tests/functional/devices/Makefile
tests/zfs-tests/tests/functional/events/Makefile
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 02b15b810..513d0cf5e 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -37,6 +37,7 @@ extern "C" {
#define FM_EREPORT_ZFS_IO "io"
#define FM_EREPORT_ZFS_DATA "data"
#define FM_EREPORT_ZFS_DELAY "delay"
+#define FM_EREPORT_ZFS_DEADMAN "deadman"
#define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
diff --git a/include/sys/spa.h b/include/sys/spa.h
index c291c5082..7bc3b2197 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -955,12 +955,15 @@ extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
-extern uint8_t spa_get_failmode(spa_t *spa);
+extern uint64_t spa_get_failmode(spa_t *spa);
+extern uint64_t spa_get_deadman_failmode(spa_t *spa);
+extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
+extern uint64_t spa_deadman_ziotime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
@@ -1074,6 +1077,10 @@ extern boolean_t spa_debug_enabled(spa_t *spa);
}
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
+extern int zfs_deadman_enabled;
+extern unsigned long zfs_deadman_synctime_ms;
+extern unsigned long zfs_deadman_ziotime_ms;
+extern unsigned long zfs_deadman_checktime_ms;
#ifdef __cplusplus
}
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 2fc598016..66fcafd9c 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -228,6 +228,7 @@ struct spa {
uint64_t spa_pool_props_object; /* object for properties */
uint64_t spa_bootfs; /* default boot filesystem */
uint64_t spa_failmode; /* failure mode for the pool */
+ uint64_t spa_deadman_failmode; /* failure mode for deadman */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
/* per-CPU array of root of async I/O: */
@@ -270,7 +271,8 @@ struct spa {
taskqid_t spa_deadman_tqid; /* Task id */
uint64_t spa_deadman_calls; /* number of deadman calls */
hrtime_t spa_sync_starttime; /* starting time of spa_sync */
- uint64_t spa_deadman_synctime; /* deadman expiration timer */
+ uint64_t spa_deadman_synctime; /* deadman sync expiration */
+ uint64_t spa_deadman_ziotime; /* deadman zio expiration */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
uint64_t spa_errata; /* errata issues detected */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index bc2f4f0ea..022713096 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -84,7 +84,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
-extern void vdev_deadman(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd, char *tag);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 6f32b11c7..d6da0d32b 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -317,6 +317,7 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
extern void cv_signal(kcondvar_t *cv);
extern void cv_broadcast(kcondvar_t *cv);
+#define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_wait_sig(cv, mp) cv_wait(cv, mp)
#define cv_wait_io(cv, mp) cv_wait(cv, mp)
diff --git a/include/sys/zio.h b/include/sys/zio.h
index bf4818e16..1ed1f1ffb 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -556,6 +556,7 @@ extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
+extern void zio_deadman(zio_t *zio, char *tag);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
diff --git a/man/man1/ztest.1 b/man/man1/ztest.1
index 64f543d21..b8cb0d45d 100644
--- a/man/man1/ztest.1
+++ b/man/man1/ztest.1
@@ -129,6 +129,10 @@ Total test run time.
.BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs)
.IP
Injected failure rate.
+.HP
+.BI "\-G"
+.IP
+Dump zfs_dbgmsg buffer before exiting.
.SH "EXAMPLES"
.LP
To override /tmp as your location for block files, you can use the -f
diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5
index 5cef4f539..4c60eecc5 100644
--- a/man/man5/zfs-events.5
+++ b/man/man5/zfs-events.5
@@ -55,7 +55,7 @@ part here.
\fBchecksum\fR
.ad
.RS 12n
-Issued when a checksum error have been detected.
+Issued when a checksum error has been detected.
.RE
.sp
@@ -79,11 +79,24 @@ Issued when there have been data errors in the pool.
.sp
.ne 2
.na
+\fBdeadman\fR
+.ad
+.RS 12n
+Issued when an I/O is determined to be "hung", this can be caused by lost
+completion events due to flaky hardware or drivers. See the
+\fBzfs_deadman_failmode\fR module option description for additional
+information regarding "hung" I/O detection and configuration.
+.RE
+
+.sp
+.ne 2
+.na
\fBdelay\fR
.ad
.RS 12n
-Issued when an I/O was slow to complete as defined by the zio_delay_max module
-option.
+Issued when a completed I/O exceeds the maximum allowed time specified
+by the \fBzio_delay_max\fR module option. This can be an indicator of
+problems with the underlying storage device.
.RE
.sp
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 5b7a29d32..039e024bb 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -823,14 +823,36 @@ Default value: \fB0\fR.
.ad
.RS 12n
When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
-milliseconds, a "slow spa_sync" message is logged to the debug log
-(see \fBzfs_dbgmsg_enable\fR). If \fBzfs_deadman_enabled\fR is set,
-all pending IO operations are also checked and if any haven't completed
-within \fBzfs_deadman_synctime_ms\fR milliseconds, a "SLOW IO" message
-is logged to the debug log and a "delay" system event with the details of
-the hung IO is posted.
+milliseconds, or when an individual I/O takes longer than
+\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
+be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
+invoked as described by the \fBzfs_deadman_failmode\fR module option.
+By default the deadman is enabled and configured to \fBwait\fR which results
+in "hung" I/Os only being logged. The deadman is automatically disabled
+when a pool gets suspended.
.sp
-Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_failmode\fR (charp)
+.ad
+.RS 12n
+Controls the failure behavior when the deadman detects a "hung" I/O. Valid
+values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR.
+.sp
+\fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a
+"deadman" event will be posted describing that I/O.
+.sp
+\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it
+to the I/O pipeline if possible.
+.sp
+\fBpanic\fR - Panic the system. This can be used to facilitate an automatic
+fail-over to a properly configured fail-over partner.
+.sp
+Default value: \fBwait\fR.
.RE
.sp
@@ -839,11 +861,10 @@ Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
\fBzfs_deadman_checktime_ms\fR (int)
.ad
.RS 12n
-Once a pool sync operation has taken longer than
-\fBzfs_deadman_synctime_ms\fR milliseconds, continue to check for slow
-operations every \fBzfs_deadman_checktime_ms\fR milliseconds.
+Check time in milliseconds. This defines the frequency at which we check
+for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior.
.sp
-Default value: \fB5,000\fR.
+Default value: \fB60,000\fR.
.RE
.sp
@@ -853,12 +874,25 @@ Default value: \fB5,000\fR.
.ad
.RS 12n
Interval in milliseconds after which the deadman is triggered and also
-the interval after which an IO operation is considered to be "hung"
-if \fBzfs_deadman_enabled\fR is set.
+the interval after which a pool sync operation is considered to be "hung".
+Once this limit is exceeded the deadman will be invoked every
+\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes.
+.sp
+Default value: \fB600,000\fR.
+.RE
-See \fBzfs_deadman_enabled\fR.
.sp
-Default value: \fB1,000,000\fR.
+.ne 2
+.na
+\fBzfs_deadman_ziotime_ms\fR (ulong)
+.ad
+.RS 12n
+Interval in milliseconds after which the deadman is triggered and an
+individual IO operation is considered to be "hung". As long as the I/O
+remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
+milliseconds until the I/O completes.
+.sp
+Default value: \fB300,000\fR.
.RE
.sp
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index dc0f8d5fe..e742af255 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -292,18 +292,25 @@ int zfs_free_leak_on_eio = B_FALSE;
/*
* Expiration time in milliseconds. This value has two meanings. First it is
* used to determine when the spa_deadman() logic should fire. By default the
- * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
* Secondly, the value determines if an I/O is considered "hung". Any I/O that
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
- * in a system panic.
+ * in one of three behaviors controlled by zfs_deadman_failmode.
*/
-unsigned long zfs_deadman_synctime_ms = 1000000ULL;
+unsigned long zfs_deadman_synctime_ms = 600000ULL;
+
+/*
+ * This value controls the maximum amount of time zio_wait() will block for an
+ * outstanding IO. By default this is 300 seconds at which point the "hung"
+ * behavior will be applied as described for zfs_deadman_synctime_ms.
+ */
+unsigned long zfs_deadman_ziotime_ms = 300000ULL;
/*
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
-unsigned long zfs_deadman_checktime_ms = 5000ULL;
+unsigned long zfs_deadman_checktime_ms = 60000ULL;
/*
* By default the deadman is enabled.
@@ -311,6 +318,16 @@ unsigned long zfs_deadman_checktime_ms = 5000ULL;
int zfs_deadman_enabled = 1;
/*
+ * Controls the behavior of the deadman when it detects a "hung" I/O.
+ * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
+ *
+ * wait - Wait for the "hung" I/O (default)
+ * continue - Attempt to recover from a "hung" I/O
+ * panic - Panic the system
+ */
+char *zfs_deadman_failmode = "wait";
+
+/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
* times the size; so just assume that. Add to this the fact that
@@ -536,7 +553,7 @@ spa_deadman(void *arg)
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
- vdev_deadman(spa->spa_root_vdev);
+ vdev_deadman(spa->spa_root_vdev, FTAG);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@@ -590,6 +607,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc_state = SPA_PROC_NONE;
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+ spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
+ spa_set_deadman_failmode(spa, zfs_deadman_failmode);
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
@@ -1681,7 +1700,7 @@ spa_update_dspace(spa_t *spa)
* Return the failure mode that has been set to this pool. The default
* behavior will be to block all I/Os when a complete failure occurs.
*/
-uint8_t
+uint64_t
spa_get_failmode(spa_t *spa)
{
return (spa->spa_failmode);
@@ -1771,6 +1790,31 @@ spa_deadman_synctime(spa_t *spa)
}
uint64_t
+spa_deadman_ziotime(spa_t *spa)
+{
+ return (spa->spa_deadman_ziotime);
+}
+
+uint64_t
+spa_get_deadman_failmode(spa_t *spa)
+{
+ return (spa->spa_deadman_failmode);
+}
+
+void
+spa_set_deadman_failmode(spa_t *spa, const char *failmode)
+{
+ if (strcmp(failmode, "wait") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+ else if (strcmp(failmode, "continue") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ else if (strcmp(failmode, "panic") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+ else
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+}
+
+uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
uint64_t asize = DVA_GET_ASIZE(dva);
@@ -2106,6 +2150,33 @@ spa_get_hostid(void)
}
#if defined(_KERNEL) && defined(HAVE_SPL)
+
+#include <linux/mod_compat.h>
+
+static int
+param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
+{
+ spa_t *spa = NULL;
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(-EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
+ if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
+ strcmp(val, "panic"))
+ return (SET_ERROR(-EINVAL));
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa_set_deadman_failmode(spa, val);
+ mutex_exit(&spa_namespace_lock);
+
+ return (param_set_charp(val, kp));
+}
+
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
EXPORT_SYMBOL(spa_add);
@@ -2196,7 +2267,12 @@ MODULE_PARM_DESC(zfs_free_leak_on_eio,
"Set to ignore IO errors during free and permanently leak the space");
module_param(zfs_deadman_synctime_ms, ulong, 0644);
-MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds");
+MODULE_PARM_DESC(zfs_deadman_synctime_ms,
+ "Pool sync expiration time in milliseconds");
+
+module_param(zfs_deadman_ziotime_ms, ulong, 0644);
+MODULE_PARM_DESC(zfs_deadman_ziotime_ms,
+ "IO expiration time in milliseconds");
module_param(zfs_deadman_checktime_ms, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_checktime_ms,
@@ -2205,6 +2281,10 @@ MODULE_PARM_DESC(zfs_deadman_checktime_ms,
module_param(zfs_deadman_enabled, int, 0644);
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
+module_param_call(zfs_deadman_failmode, param_set_deadman_failmode,
+ param_get_charp, &zfs_deadman_failmode, 0644);
+MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer");
+
module_param(spa_asize_inflation, int, 0644);
MODULE_PARM_DESC(spa_asize_inflation,
"SPA size estimate multiplication factor");
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 9edeaf525..7fd0bd1ac 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3707,12 +3707,12 @@ vdev_split(vdev_t *vd)
}
void
-vdev_deadman(vdev_t *vd)
+vdev_deadman(vdev_t *vd, char *tag)
{
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
- vdev_deadman(cvd);
+ vdev_deadman(cvd, tag);
}
if (vd->vdev_ops->vdev_op_leaf) {
@@ -3724,21 +3724,18 @@ vdev_deadman(vdev_t *vd)
zio_t *fio;
uint64_t delta;
+ zfs_dbgmsg("slow vdev: %s has %d active IOs",
+ vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
- * the spa_deadman_synctime we log a zevent.
+ * the spa_deadman_synctime invoke the deadman logic.
*/
fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
- if (delta > spa_deadman_synctime(spa)) {
- zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
- "delta %lluns, last io %lluns",
- fio->io_timestamp, delta,
- vq->vq_io_complete_ts);
- zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
- spa, vd, &fio->io_bookmark, fio, 0, 0);
- }
+ if (delta > spa_deadman_synctime(spa))
+ zio_deadman(fio, tag);
}
mutex_exit(&vq->vq_lock);
}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 7d2370034..263c77e4a 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1748,6 +1748,80 @@ zio_delay_interrupt(zio_t *zio)
zio_interrupt(zio);
}
+static void
+zio_deadman_impl(zio_t *pio)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+ vdev_t *vd = pio->io_vd;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_t *vq = &vd->vdev_queue;
+ zbookmark_phys_t *zb = &pio->io_bookmark;
+ uint64_t delta = gethrtime() - pio->io_timestamp;
+ uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
+
+ zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
+ "delta=%llu queued=%llu io=%llu "
+ "path=%s last=%llu "
+ "type=%d priority=%d flags=0x%x "
+ "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
+ "objset=%llu object=%llu level=%llu blkid=%llu "
+ "offset=%llu size=%llu error=%d",
+ pio, pio->io_timestamp,
+ delta, pio->io_delta, pio->io_delay,
+ vd->vdev_path, vq->vq_io_complete_ts,
+ pio->io_type, pio->io_priority, pio->io_flags,
+ pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pio->io_offset, pio->io_size, pio->io_error);
+ zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
+ pio->io_spa, vd, zb, pio, 0, 0);
+
+ if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
+ taskq_empty_ent(&pio->io_tqent)) {
+ zio_interrupt(pio);
+ }
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_deadman_impl(cio);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Log the critical information describing this zio and all of its children
+ * using the zfs_dbgmsg() interface then post deadman event for the ZED.
+ */
+void
+zio_deadman(zio_t *pio, char *tag)
+{
+ spa_t *spa = pio->io_spa;
+ char *name = spa_name(spa);
+
+ if (!zfs_deadman_enabled || spa_suspended(spa))
+ return;
+
+ zio_deadman_impl(pio);
+
+ switch (spa_get_deadman_failmode(spa)) {
+ case ZIO_FAILURE_MODE_WAIT:
+ zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_CONTINUE:
+ zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_PANIC:
+ fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
+ break;
+ }
+}
+
/*
* Execute the I/O pipeline until one of the following occurs:
* (1) the I/O completes; (2) the pipeline stalls waiting for
@@ -1877,6 +1951,7 @@ __zio_execute(zio_t *zio)
int
zio_wait(zio_t *zio)
{
+ long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
int error;
ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
@@ -1889,8 +1964,19 @@ zio_wait(zio_t *zio)
__zio_execute(zio);
mutex_enter(&zio->io_lock);
- while (zio->io_executor != NULL)
- cv_wait_io(&zio->io_cv, &zio->io_lock);
+ while (zio->io_executor != NULL) {
+ error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
+ ddi_get_lbolt() + timeout);
+
+ if (zfs_deadman_enabled && error == -1 &&
+ gethrtime() - zio->io_queued_timestamp >
+ spa_deadman_ziotime(zio->io_spa)) {
+ mutex_exit(&zio->io_lock);
+ timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
+ zio_deadman(zio, FTAG);
+ mutex_enter(&zio->io_lock);
+ }
+ }
mutex_exit(&zio->io_lock);
error = zio->io_error;
diff --git a/scripts/zloop.sh b/scripts/zloop.sh
index 4e0afac5b..50ceb4dfa 100755
--- a/scripts/zloop.sh
+++ b/scripts/zloop.sh
@@ -115,7 +115,10 @@ function store_core
or_die mv ztest.ddt "$dest/"
or_die mv ztest.out "$dest/"
or_die mv "$workdir/ztest*" "$dest/vdev/"
- or_die mv "$workdir/zpool.cache" "$dest/vdev/"
+
+ if [[ -e "$workdir/zpool.cache" ]]; then
+ or_die mv "$workdir/zpool.cache" "$dest/vdev/"
+ fi
# check for core
if [[ -f "$core" ]]; then
@@ -141,8 +144,6 @@ function store_core
# Record info in cores logfile
echo "*** core @ $coredir/$coreid/$core:" | \
tee -a ztest.cores
- echo "$corestatus" | tee -a ztest.cores
- echo "" | tee -a ztest.cores
fi
echo "continuing..."
fi
@@ -203,7 +204,7 @@ curtime=$starttime
# if no timeout was specified, loop forever.
while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
- zopt="-VVVVV"
+ zopt="-G -VVVVV"
# start each run with an empty directory
workdir="$basedir/$rundir"
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 0cecdd673..af96e6a64 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -440,6 +440,12 @@ tags = ['functional', 'compression']
tests = ['ctime_001_pos' ]
tags = ['functional', 'ctime']
+[tests/functional/deadman]
+tests = ['deadman_sync', 'deadman_zio']
+pre =
+post =
+tags = ['functional', 'deadman']
+
[tests/functional/delegate]
tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos',
'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index dc4bf51b4..49add883e 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -12,6 +12,7 @@ SUBDIRS = \
cli_user \
compression \
ctime \
+ deadman \
delegate \
devices \
events \
diff --git a/tests/zfs-tests/tests/functional/deadman/Makefile.am b/tests/zfs-tests/tests/functional/deadman/Makefile.am
new file mode 100644
index 000000000..9c4b0713c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/deadman/Makefile.am
@@ -0,0 +1,5 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman
+dist_pkgdata_SCRIPTS = \
+ deadman.cfg \
+ deadman_sync.ksh \
+ deadman_zio.ksh
diff --git a/tests/zfs-tests/tests/functional/deadman/deadman.cfg b/tests/zfs-tests/tests/functional/deadman/deadman.cfg
new file mode 100644
index 000000000..e767f3dd5
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/deadman/deadman.cfg
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+export DISK1=${DISKS%% *}
+
+export SYNCTIME_DEFAULT=600000
+export ZIOTIME_DEFAULT=300000
+export CHECKTIME_DEFAULT=60000
+export FAILMODE_DEFAULT="wait"
diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh
new file mode 100755
index 000000000..a5537c435
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh
@@ -0,0 +1,86 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+# DESCRIPTION:
+# Verify spa deadman detects a hung txg
+#
+# STRATEGY:
+# 1. Reduce the zfs_deadman_synctime_ms to 5s.
+# 2. Reduce the zfs_deadman_checktime_ms to 1s.
+# 3. Inject a 10s zio delay to force long IOs.
+# 4. Write enough data to force a long txg sync time due to the delay.
+# 5. Verify a "deadman" event is posted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/deadman/deadman.cfg
+
+verify_runnable "both"
+
+function cleanup
+{
+ log_must zinject -c all
+ default_cleanup_noexit
+
+ log_must set_tunable64 zfs_deadman_synctime_ms $SYNCTIME_DEFAULT
+ log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
+ log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
+}
+
+log_assert "Verify spa deadman detects a hung txg"
+log_onexit cleanup
+
+log_must set_tunable64 zfs_deadman_synctime_ms 5000
+log_must set_tunable64 zfs_deadman_checktime_ms 1000
+log_must set_tunable64 zfs_deadman_failmode "wait"
+
+# Create a new pool in order to use the updated deadman settings.
+default_setup_noexit $DISK1
+log_must zpool events -c
+
+# Force each IO to take 10s by allow them to run concurrently.
+log_must zinject -d $DISK1 -D10000:10 $TESTPOOL
+
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file
+sleep 10
+
+log_must zinject -c all
+log_must zpool sync
+
+# Log txg sync times for reference and the zpool event summary.
+log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs
+log_must zpool events
+
+# Verify at least 5 deadman events were logged. The first after 5 seconds,
+# and another each second thereafter until the delay is clearer.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -lt 5 ]; then
+ log_fail "Expect >=5 deadman events, $events found"
+fi
+
+log_pass "Verify spa deadman detected a hung txg and $events deadman events"
diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh
new file mode 100755
index 000000000..a61be995a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh
@@ -0,0 +1,113 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+# DESCRIPTION:
+# Verify zio deadman detects a hung zio
+#
+# STRATEGY:
+# 1. Reduce the zfs_deadman_ziotime_ms to 5s.
+# 2. Reduce the zfs_deadman_checktime_ms to 1s.
+# 3. Inject a 10s zio delay to force long IOs.
+# 4. Read an uncached file in the background.
+# 5. Verify a "deadman" event is posted.
+# 6. Inject a 100ms zio delay which is under the 5s allowed.
+# 7. Read an uncached file in the background.
+# 8. Verify a "deadman" event is not posted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/deadman/deadman.cfg
+
+verify_runnable "both"
+
+function cleanup
+{
+ log_must zinject -c all
+ default_cleanup_noexit
+
+ log_must set_tunable64 zfs_deadman_ziotime_ms $ZIOTIME_DEFAULT
+ log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
+ log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
+}
+
+log_assert "Verify zio deadman detects a hung zio"
+log_onexit cleanup
+
+# 1. Reduce the zfs_deadman_ziotime_ms to 5s.
+log_must set_tunable64 zfs_deadman_ziotime_ms 5000
+# 2. Reduce the zfs_deadman_checktime_ms to 1s.
+log_must set_tunable64 zfs_deadman_checktime_ms 1000
+log_must set_tunable64 zfs_deadman_failmode "wait"
+
+# Create a new pool in order to use the updated deadman settings.
+default_setup_noexit $DISK1
+
+# Write a file and export/import the pool to clear the ARC.
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file1
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file2
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must stat -t /$mntpnt/file1
+log_must stat -t /$mntpnt/file2
+
+# 3. Inject a 10s zio delay to force long IOs and serialize them..
+log_must zpool events -c
+log_must zinject -d $DISK1 -D10000:1 $TESTPOOL
+
+# 4. Read an uncached file in the background, it's expected to hang.
+log_must eval "dd if=/$mntpnt/file1 of=/dev/null bs=1048576 &"
+sleep 10
+log_must zinject -c all
+log_must zpool sync
+wait
+
+# 5. Verify a "deadman" event is posted. The first appears after 5
+# seconds, and another each second thereafter until the delay is cleared.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -lt 1 ]; then
+ log_fail "Expect >=1 deadman events, $events found"
+fi
+
+# 6. Inject a 100ms zio delay which is under the 5s allowed, allow them
+# to run concurrently so they don't get starved in the queue.
+log_must zpool events -c
+log_must zinject -d $DISK1 -D100:10 $TESTPOOL
+
+# 7. Read an uncached file in the background.
+log_must eval "dd if=/$mntpnt/file2 of=/dev/null bs=1048576 &"
+sleep 10
+log_must zinject -c all
+wait
+
+# 8. Verify a "deadman" event is not posted.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -ne 0 ]; then
+ log_fail "Expect 0 deadman events, $events found"
+fi
+
+log_pass "Verify zio deadman detected a hung zio and $events deadman events"