summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Tom Caputi <[email protected]> 2018-10-10 16:48:33 -0400
committer: Brian Behlendorf <[email protected]> 2018-10-24 14:36:21 -0700
commit: 9410257800789cde96abf135663930d0f44661e6 (patch)
tree: 367ea0f0053fcfb3b57e2810f30faa693fc14563
parent: e871a8f0584a3259c342f8e3e817159600302832 (diff)
Fix random ztest_deadman_thread failures
The zloop test has been failing in buildbot for the last few weeks with various failures in ztest_deadman_thread(). This is due to the fact that this thread is not stopped when performing pool import / export tests as it should be. This patch simply corrects this.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #8010
-rw-r--r-- cmd/ztest/ztest.c 36
-rw-r--r-- module/zfs/spa_misc.c 2
2 files changed, 26 insertions, 12 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index f277312c6..47f3cdf12 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6495,13 +6495,20 @@ ztest_deadman_thread(void *arg)
{
ztest_shared_t *zs = arg;
spa_t *spa = ztest_spa;
- hrtime_t delta, overdue, total = 0;
+ hrtime_t delay, overdue, last_run = gethrtime();
- for (;;) {
- delta = zs->zs_thread_stop - zs->zs_thread_start +
- MSEC2NSEC(zfs_deadman_synctime_ms);
+ delay = (zs->zs_thread_stop - zs->zs_thread_start) +
+ MSEC2NSEC(zfs_deadman_synctime_ms);
- (void) poll(NULL, 0, (int)NSEC2MSEC(delta));
+ while (!ztest_exiting) {
+ /*
+ * Wait for the delay timer while checking occasionally
+ * if we should stop.
+ */
+ if (gethrtime() < last_run + delay) {
+ (void) poll(NULL, 0, 1000);
+ continue;
+ }
/*
* If the pool is suspended then fail immediately. Otherwise,
@@ -6522,15 +6529,20 @@ ztest_deadman_thread(void *arg)
* then it may be hung and is terminated.
*/
overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
- total += zfs_deadman_synctime_ms / 1000;
if (gethrtime() > overdue) {
fatal(0, "aborting test after %llu seconds because "
- "the process is overdue for termination.", total);
+ "the process is overdue for termination.",
+ (gethrtime() - zs->zs_proc_start) / NANOSEC);
}
(void) printf("ztest has been running for %lld seconds\n",
- total);
+ (gethrtime() - zs->zs_proc_start) / NANOSEC);
+
+ last_run = gethrtime();
+ delay = MSEC2NSEC(zfs_deadman_checktime_ms);
}
+
+ thread_exit();
}
static void
@@ -6724,7 +6736,7 @@ ztest_run(ztest_shared_t *zs)
{
spa_t *spa;
objset_t *os;
- kthread_t *resume_thread;
+ kthread_t *resume_thread, *deadman_thread;
kthread_t **run_threads;
uint64_t object;
int error;
@@ -6782,7 +6794,7 @@ ztest_run(ztest_shared_t *zs)
/*
* Create a deadman thread and set to panic if we hang.
*/
- (void) thread_create(NULL, 0, ztest_deadman_thread,
+ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
@@ -6849,9 +6861,10 @@ ztest_run(ztest_shared_t *zs)
umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
- /* Kill the resume thread */
+ /* Kill the resume and deadman threads */
ztest_exiting = B_TRUE;
VERIFY0(thread_join(resume_thread));
+ VERIFY0(thread_join(deadman_thread));
ztest_resume(spa);
/*
@@ -7351,6 +7364,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
zfs_deadman_synctime_ms = 300000;
+ zfs_deadman_checktime_ms = 30000;
/*
* As two-word space map entries may not come up often (especially
* if pool and vdev sizes are small) we want to force at least some
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index ae9eb4de7..a3ac70f07 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -312,7 +312,7 @@ unsigned long zfs_deadman_ziotime_ms = 300000ULL;
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
-unsigned long zfs_deadman_checktime_ms = 60000ULL;
+unsigned long zfs_deadman_checktime_ms = 60000ULL;
/*
* By default the deadman is enabled.