diff options
author | Tom Caputi <[email protected]> | 2018-11-28 13:12:08 -0500 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-11-28 10:12:08 -0800 |
commit | cef48f14da6eec8a9095a0272f1cd7d9d60ca476 (patch) | |
tree | 52f2cb54823712b25c09d0ef3af05067d98b0042 /tests | |
parent | 00369f333809c2d38629abd8ff1fe0383ccd5154 (diff) |
Remove races from scrub / resilver tests
Currently, several tests in the ZFS Test Suite that attempt to
test scrub and resilver behavior occasionally fail. A big reason
for this is that these tests use a combination of zinject and
zfs_scan_vdev_limit to attempt to slow these operations enough
to verify their test commands. This method works most of the time,
but provides no guarantees and leads to flaky behavior. This patch
adds a new tunable, zfs_scan_suspend_progress, that ensures that
scans make no progress, guaranteeing that tests can be run without
racing.
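This patch also replaces zfs_remove_max_bytes_pause with a new
zfs_removal_suspend_progress tunable that works the same way as
zfs_scan_suspend_progress (set to 1 to suspend progress, 0 to resume).
This provides some consistency between these two similar tunables and
ensures that the tunable will not misbehave on 32-bit systems.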
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Giuseppe Di Natale <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #8111
Diffstat (limited to 'tests')
12 files changed, 34 insertions, 78 deletions
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh index fd67dc769..029fa6681 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh @@ -34,6 +34,8 @@ verify_runnable "global" +log_must set_tunable32 zfs_scan_suspend_progress 0 + for pool in "$TESTPOOL" "$TESTPOOL1"; do datasetexists $pool/$TESTFS && \ log_must zfs destroy -Rf $pool/$TESTFS diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh index f42c85b98..e7edb1a3b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh @@ -42,10 +42,10 @@ # each sync. # 2. Add data to pool # 3. Re-import the pool so that data isn't cached -# 4. Use zinject to slow down device I/O +# 4. Use zfs_scan_suspend_progress to ensure resilvers don't progress # 5. Trigger the resilvering # 6. Use spa freeze to stop writing to the pool. -# 7. Clear zinject events (needed to export the pool) +# 7. Re-enable scan progress # 8. 
Export the pool # @@ -59,8 +59,7 @@ function custom_cleanup [[ -n ZFS_TXG_TIMEOUT ]] && log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT - zinject -c all - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 cleanup } @@ -88,24 +87,16 @@ function test_replacing_vdevs log_must zpool export $TESTPOOL1 log_must cp $CPATHBKP $CPATH log_must zpool import -c $CPATH -o cachefile=$CPATH $TESTPOOL1 - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW - typeset device - for device in $zinjectdevices ; do - log_must zinject -d $device -D 50:1 $TESTPOOL1 > /dev/null - done + log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool replace $TESTPOOL1 $replacevdev $replaceby # Cachefile: pool in resilvering state log_must cp $CPATH $CPATHBKP2 - # We must disable zinject in order to export the pool, so we freeze - # it first to prevent writing out subsequent resilvering progress. - log_must zpool freeze $TESTPOOL1 # Confirm pool is still replacing log_must pool_is_replacing $TESTPOOL1 - log_must zinject -c all > /dev/null - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT log_must zpool export $TESTPOOL1 + log_must set_tunable32 zfs_scan_suspend_progress 0 ( $earlyremove ) && log_must rm $replacevdev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh index 574c19275..bc2c611ae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh @@ -63,7 +63,7 @@ function custom_cleanup [[ -n ZFS_TXG_TIMEOUT ]] && log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_must rm -rf $BACKUP_DEVICE_DIR - zinject -c all + log_must set_tunable32 zfs_scan_suspend_progress 0 cleanup } @@ -98,22 +98,17 @@ 
function test_replace_vdev # This should not free original data. log_must overwrite_data $TESTPOOL1 "" - # Steps to insure resilvering happens very slowly. log_must zpool export $TESTPOOL1 log_must zpool import -d $DEVICE_DIR $TESTPOOL1 - typeset device - for device in $zinjectdevices ; do - log_must zinject -d $device -D 200:1 $TESTPOOL1 > /dev/null - done + + # Ensure resilvering doesn't complete. + log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool replace $TESTPOOL1 $replacevdev $replaceby - # We must disable zinject in order to export the pool, so we freeze - # it first to prevent writing out subsequent resilvering progress. - log_must zpool freeze $TESTPOOL1 # Confirm pool is still replacing log_must pool_is_replacing $TESTPOOL1 - log_must zinject -c all > /dev/null log_must zpool export $TESTPOOL1 + log_must set_tunable32 zfs_scan_suspend_progress 0 ############################################################ # Test 1: rewind while device is resilvering. diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index ecdf0ee53..79423abe2 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -137,6 +137,3 @@ export VDEV3=$DEVICE_DIR/${DEVICE_FILE}3 export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export ALTER_ROOT=/alter_import-test - -export ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024)) -export ZFS_SCAN_VDEV_LIMIT_DEFAULT=$((4*1024*1024)) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver.cfg index 7d92984d6..5c013c723 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver.cfg @@ -27,7 +27,4 @@ export DISK1=$(echo $DISKS | nawk 
'{print $1}') export DISK2=$(echo $DISKS | nawk '{print $2}') export DISK3=$(echo $DISKS | nawk '{print $3}') -export ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024)) -export ZFS_SCAN_VDEV_LIMIT_DEFAULT=$((4*1024*1024)) - export MAXTIMEOUT=80 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh index de9e5ecdf..4f98ced96 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh @@ -40,18 +40,12 @@ # deferred # 4. Manually restart the resilver with all drives # -# NOTES: -# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit -# low and adding a 50ms zio delay in order to ensure that the resilver -# does not complete early. -# verify_runnable "global" function cleanup { - log_must zinject -c all - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 log_must rm -f $mntpnt/biggerfile1 log_must rm -f $mntpnt/biggerfile2 } @@ -73,22 +67,19 @@ log_must sync log_must zpool detach $TESTPOOL $DISK3 # 3. Reattach the drives, causing the second drive's resilver to be deferred -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW +log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool attach $TESTPOOL $DISK1 $DISK2 -log_must zinject -d $DISK2 -D50:1 $TESTPOOL log_must is_pool_resilvering $TESTPOOL true log_must zpool attach $TESTPOOL $DISK1 $DISK3 -log_must zinject -d $DISK3 -D50:1 $TESTPOOL log_must is_pool_resilvering $TESTPOOL true # 4. 
Manually restart the resilver with all drives log_must zpool resilver $TESTPOOL -log_must zinject -c all -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT -log_must wait_for_resilver_end $TESTPOOL $MAXTIMEOUT log_must is_deferred_scan_started $TESTPOOL +log_must set_tunable32 zfs_scan_suspend_progress 0 +log_must wait_for_resilver_end $TESTPOOL $MAXTIMEOUT log_must check_state $TESTPOOL "$DISK2" "online" log_must check_state $TESTPOOL "$DISK3" "online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh index e8bb8bceb..b3cb58ceb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh @@ -30,5 +30,5 @@ verify_runnable "global" -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT +log_must set_tunable32 zfs_scan_suspend_progress 0 destroy_mirrors diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh index 712097bb1..71a204060 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh @@ -45,18 +45,12 @@ # 5. Resume the paused scrub and verify scrub is again being performed. # 6. Verify zpool scrub -s succeed when the system is scrubbing. # -# NOTES: -# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit -# low and adding a 50ms zio delay in order to ensure that the scrub does -# not complete early. 
-# verify_runnable "global" function cleanup { - log_must zinject -c all - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 log_must rm -f $mntpnt/biggerfile } @@ -69,8 +63,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) log_must file_write -b 1048576 -c 1024 -o create -d 0 -f $mntpnt/biggerfile log_must sync -log_must zinject -d $DISK1 -D50:1 $TESTPOOL -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW +log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_must zpool scrub -p $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh index c52ad84bc..56225456b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh @@ -42,23 +42,19 @@ # 2. Kick off a scrub # 2. Kick off a second scrub and verify it fails # -# NOTES: -# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit -# low in order to ensure that the scrub does not complete early. 
-# verify_runnable "global" function cleanup { - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 } log_onexit cleanup log_assert "Scrub command fails when there is already a scrub in progress" -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW +log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_mustnot zpool scrub $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh index 14563d64d..9b6274cd1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh @@ -43,14 +43,10 @@ # 4. Export/import the pool to ensure the cache is dropped # 5. Verify scrub failed until the resilver completed # -# NOTES: -# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit -# low in order to ensure that the scrub does not complete early. -# function cleanup { - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 rm -f $mntpnt/extra } @@ -61,7 +57,9 @@ log_onexit cleanup log_assert "Resilver prevent scrub from starting until the resilver completes" mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW + +# Temporarily prevent scan progress so our test doesn't race +log_must set_tunable32 zfs_scan_suspend_progress 1 while ! is_pool_resilvering $TESTPOOL; do log_must zpool detach $TESTPOOL $DISK2 @@ -74,6 +72,7 @@ done log_must is_pool_resilvering $TESTPOOL log_mustnot zpool scrub $TESTPOOL +log_must set_tunable32 zfs_scan_suspend_progress 0 while ! 
is_pool_resilvered $TESTPOOL; do sleep 1 done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh index ffc841f76..1a5c3198f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh @@ -41,7 +41,7 @@ verify_runnable "both" function cleanup { - log_must zinject -c all + log_must set_tunable32 zfs_scan_suspend_progress 0 destroy_pool $TESTPOOL destroy_pool $TESTPOOL2 rm -f $DEVICE1 $DEVICE2 @@ -68,10 +68,8 @@ function zpool_split #disk_to_be_offline/online log_must file_write -b 2097152 -c 1024 -o create -d 0 -f $mntpnt/biggerfile log_must sync - # slow-down resilvering, so it will not finish too early - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW - log_must zinject -d $DEVICE1 -D 50:1 $TESTPOOL - log_must zinject -d $DEVICE2 -D 50:1 $TESTPOOL + # temporarily prevent resilvering progress, so it will not finish too early + log_must set_tunable32 zfs_scan_suspend_progress 1 log_must zpool online $TESTPOOL $disk @@ -86,7 +84,7 @@ function zpool_split #disk_to_be_offline/online log_mustnot zpool split $TESTPOOL $TESTPOOL2 - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable32 zfs_scan_suspend_progress 0 } log_assert "Verify 'zpool split' will fail if resilver in progress for a disk" @@ -96,15 +94,12 @@ DEVSIZE='3g' DEVICE1="$TEST_BASE_DIR/device-1" DEVICE2="$TEST_BASE_DIR/device-2" -ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024)) -ZFS_SCAN_VDEV_LIMIT_DEFAULT=$(get_tunable zfs_scan_vdev_limit) - -log_note "Verify ZFS prevents main pool curruption during 'split'" +log_note "Verify ZFS prevents main pool corruption during 'split'" zpool_split $DEVICE1 cleanup -log_note "Verify ZFS prevents new pool curruption during 'split'" +log_note "Verify ZFS prevents new pool corruption 
during 'split'" zpool_split $DEVICE2 log_pass "'zpool split' failed as expected" diff --git a/tests/zfs-tests/tests/functional/removal/removal.kshlib b/tests/zfs-tests/tests/functional/removal/removal.kshlib index 7aa383585..c1ab044c7 100644 --- a/tests/zfs-tests/tests/functional/removal/removal.kshlib +++ b/tests/zfs-tests/tests/functional/removal/removal.kshlib @@ -62,7 +62,7 @@ function attempt_during_removal # pool disk callback [args] typeset callback=$3 shift 3 - set_tunable64 zfs_remove_max_bytes_pause 0 + set_tunable32 zfs_removal_suspend_progress 1 log_must zpool remove $pool $disk @@ -81,7 +81,7 @@ function attempt_during_removal # pool disk callback [args] # log_must is_pool_removing $pool - set_tunable64 zfs_remove_max_bytes_pause 18446744073709551615 + set_tunable32 zfs_removal_suspend_progress 0 log_must wait_for_removal $pool log_mustnot vdevs_in_pool $pool $disk |