diff options
author | John Poduska <[email protected]> | 2020-05-13 13:54:27 -0400 |
---|---|---|
committer | GitHub <[email protected]> | 2020-05-13 10:54:27 -0700 |
commit | 41035a049643ff7083a6cb6cd43b8eb70a7d18a1 (patch) | |
tree | 121cec91a2b10ca6e9fb7d376a7599d47f4aee8c /tests/zfs-tests | |
parent | b29e31d80d6cb78dbd889e9b529333944b4c3ba1 (diff) |
Resilver restarts unnecessarily when it encounters errors
When a resilver finishes, vdev_dtl_reassess is called to hopefully
excise DTL_MISSING (amongst other things). If there are errors during
the resilver, they are tracked in DTL_SCRUB, as spelled out in the
block comment in vdev.c. DTL_SCRUB is in-core only, so it can only
be used if the pool was online for the whole resilver. This state is
tracked with the spa_scrub_started flag, which only gets set when
the scan is initialized. Unfortunately, this flag gets cleared right
before vdev_dtl_reassess gets called, so if there are any errors
during the scan, DTL_MISSING will never get excised and the resilver
will just continually restart. This fix simply moves clearing that
flag until after the call to vdev_dtl_reasses.
In addition, if a pool is imported and already has scn_errors > 0,
this change will restart the resilver immediately instead of doing
the rest of the scan and then restarting it from the beginning. On
the other hand, if scn_errors == 0 at import, then no errors have
been encountered so far, so the spa_scrub_started flag can be safely
set.
A test has been added to verify that resilver does not restart when
relevant DTL's are available.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Zuchowski <[email protected]>
Signed-off-by: John Poduska <[email protected]>
Closes #10291
Diffstat (limited to 'tests/zfs-tests')
-rw-r--r-- | tests/zfs-tests/include/tunables.cfg | 1 | ||||
-rw-r--r-- | tests/zfs-tests/tests/functional/resilver/Makefile.am | 3 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh | 102 |
3 files changed, 105 insertions, 1 deletions
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index f85b15658..efbcc09e7 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -59,6 +59,7 @@ OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_esti REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms +SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am index 465d8f3a3..38136a843 100644 --- a/tests/zfs-tests/tests/functional/resilver/Makefile.am +++ b/tests/zfs-tests/tests/functional/resilver/Makefile.am @@ -2,7 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - resilver_restart_001.ksh + resilver_restart_001.ksh \ + resilver_restart_002.ksh dist_pkgdata_DATA = \ resilver.cfg diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh new file mode 100755 index 000000000..ebe5e693b --- /dev/null +++ b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Datto Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/resilver/resilver.cfg + +# +# DESCRIPTION: +# Testing resilver completes when scan errors are encountered, but relevant +# DTL's have not been lost. +# +# STRATEGY: +# 1. Create a pool (1k recordsize) +# 2. Create a 32m file (32k records) +# 3. Inject an error halfway through the file +# 4. Start a resilver, ensure the error is triggered and that the resilver +# does not restart after finishing +# +# NB: use legacy scanning to ensure scan of specific block causes error +# + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE + log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY +} + +log_assert "Check for resilver restarts caused by scan errors" + +ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY) + +log_onexit cleanup + +# use legacy scan to ensure injected error will be triggered +log_must set_tunable32 SCAN_LEGACY 1 + + # create the pool and a 32M file (32k blocks) +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1 + +# determine objset/object +objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL/file | awk '{print $1}') + +# inject event to cause error during resilver +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL + +# clear events and start resilver +log_must zpool events -c +log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE + +log_note "waiting for read errors to start showing up" +for iter in {0..59} +do + zpool sync $TESTPOOL + err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}') + (( $err > 0 )) && break + sleep 1 +done + +(( $err == 0 )) && log_fail "Unable to induce errors in resilver" + +log_note "waiting for resilver to finish" +for iter in {0..59} +do + finish=$(zpool events | grep "sysevent.fs.zfs.resilver_finish" | wc -l) + (( $finish > 0 )) && break + sleep 1 +done + +(( $finish == 0 )) && log_fail "resilver took too long to finish" + +# wait a few syncs to ensure that zfs does not restart the resilver +log_must zpool sync $TESTPOOL +log_must zpool sync $TESTPOOL + +# check if resilver was restarted +start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) +(( $start != 1 )) && log_fail "resilver restarted unnecessarily" + +log_pass "Resilver did not restart unnecessarily from scan errors" |