author     loli10K <[email protected]>          2018-09-18 23:45:52 +0200
committer  Brian Behlendorf <[email protected]>  2018-11-09 11:17:24 -0800
commit     d48091de81e5eab2aa32d7a52db4f147bd813523
tree       afb0974ccc0d02287e4734d3142a900b6959758c /tests
parent     13c59bb76b2f56db9f3ff6597d8a865347158e2c
zed: detect and offline physically removed devices
This commit adds a new test case to the ZFS Test Suite to verify ZED
can detect when a device is physically removed from a running system:
the device will be offlined if a spare is not available in the pool.
We implement this using the existing libudev functionality rather than
relying solely on the FM kernel module capabilities, which have been
observed to be unreliable with some kernels.
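
For context, ZED's removal detection consumes block-device events from udev.
A quick way to watch the same event stream by hand while unplugging and
reattaching a disk is shown below; this is purely illustrative and not part
of this change:

    # Print udev events for block devices as they are processed; ZED's
    # libudev-based detection listens to this same stream.
    udevadm monitor --udev --subsystem-match=block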
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Don Brady <[email protected]>
Signed-off-by: loli10K <[email protected]>
Closes #1537
Closes #7926
Diffstat (limited to 'tests')
4 files changed, 184 insertions, 8 deletions
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index e52ab9078..e5826dd7a 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -543,10 +543,10 @@ tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/fault]
-tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
-    'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
-    'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
-    'decompress_fault','zpool_status_-s']
+tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos',
+    'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_ashift',
+    'auto_spare_multiple', 'auto_spare_shared', 'scrub_after_resilver',
+    'decrypt_fault', 'decompress_fault', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/async_destroy]
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
index 289e3e33f..f39e6267b 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
@@ -96,8 +96,7 @@ for type in " " mirror raidz raidz2; do
 	fi
 
 	typeset prev_size=$(get_pool_prop size $TESTPOOL1)
-	typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
-	    awk '{print $3}')
+	typeset zfs_prev_size=$(get_prop avail $TESTPOOL1)
 
 	# Expand each device as appropriate being careful to add an artificial
 	# delay to ensure we get a single history entry for each.  This makes
@@ -117,8 +116,7 @@ for type in " " mirror raidz raidz2; do
 	log_must zpool online -e $TESTPOOL1 $FILE_RAW
 
 	typeset expand_size=$(get_pool_prop size $TESTPOOL1)
-	typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
-	    awk '{print $3}')
+	typeset zfs_expand_size=$(get_prop avail $TESTPOOL1)
 
 	log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
 	    "expanded size: $expand_size"
diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am
index 5c68ea26f..f2fc06877 100644
--- a/tests/zfs-tests/tests/functional/fault/Makefile.am
+++ b/tests/zfs-tests/tests/functional/fault/Makefile.am
@@ -2,6 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/fault
 dist_pkgdata_SCRIPTS = \
 	setup.ksh \
 	cleanup.ksh \
+	auto_offline_001_pos.ksh \
 	auto_online_001_pos.ksh \
 	auto_replace_001_pos.ksh \
 	auto_spare_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
new file mode 100755
index 000000000..bd0fd4c87
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
@@ -0,0 +1,177 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2018, loli10K <[email protected]>. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/events/events_common.kshlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+#
+# DESCRIPTION:
+# Testing Fault Management Agent ZED Logic - Physically removed device is
+# offlined and onlined when reattached
+#
+# STRATEGY:
+# 1. Create a pool
+# 2. Simulate physical removal of one device
+# 3. Verify the device is offlined
+# 4. Reattach the device
+# 5. Verify the device is onlined
+# 6. Repeat the same tests with a spare device: zed will use the spare to
+#    handle the removed data device
+# 7. Repeat the same tests again with a faulted spare device: zed should
+#    offline the removed data device if no spare is available
+#
+# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
+# conditions caused by mixing creation/removal events from partitioning the
+# disk (zpool create) and events from physically removing it (remove_disk).
+#
+verify_runnable "both"
+
+if is_linux; then
+	# Add one 512b scsi_debug device (4Kn would generate IO errors)
+	# NOTE: must be larger than other "file" vdevs and minimum SPA devsize:
+	# add 32m of fudge
+	load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b'
+else
+	log_unsupported "scsi debug module unsupported"
+fi
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+	rm -f $filedev1
+	rm -f $filedev2
+	rm -f $filedev3
+	rm -f $sparedev
+	unload_scsi_debug
+}
+
+log_assert "ZED detects physically removed devices"
+
+log_onexit cleanup
+
+filedev1="$TEST_BASE_DIR/file-vdev-1"
+filedev2="$TEST_BASE_DIR/file-vdev-2"
+filedev3="$TEST_BASE_DIR/file-vdev-3"
+sparedev="$TEST_BASE_DIR/file-vdev-spare"
+removedev=$(get_debug_device)
+
+typeset poolconfs=("mirror $filedev1 $removedev"
+    "raidz $filedev1 $removedev"
+    "raidz2 $filedev1 $filedev2 $removedev"
+    "raidz3 $filedev1 $filedev2 $filedev3 $removedev"
+    "$filedev1 cache $removedev"
+    "mirror $filedev1 $filedev2 cache $removedev"
+    "raidz $filedev1 $filedev2 $filedev3 cache $removedev"
+)
+
+log_must truncate -s $SPA_MINDEVSIZE $filedev1
+log_must truncate -s $SPA_MINDEVSIZE $filedev2
+log_must truncate -s $SPA_MINDEVSIZE $filedev3
+log_must truncate -s $SPA_MINDEVSIZE $sparedev
+
+for conf in "${poolconfs[@]}"
+do
+	# 1. Create a pool
+	log_must zpool create -f $TESTPOOL $conf
+	block_device_wait
+
+	# 2. Simulate physical removal of one device
+	remove_disk $removedev
+
+	# 3. Verify the device is offlined
+	log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+
+	# 4. Reattach the device
+	insert_disk $removedev
+
+	# 5. Verify the device is onlined
+	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
+
+	# cleanup
+	destroy_pool $TESTPOOL
+	log_must parted "/dev/${removedev}" -s -- mklabel msdos
+	block_device_wait
+done
+
+# 6. Repeat the same tests with a spare device: zed will use the spare to
+#    handle the removed data device
+for conf in "${poolconfs[@]}"
+do
+	# 1. Create a pool with a spare
+	log_must zpool create -f $TESTPOOL $conf
+	block_device_wait
+	log_must zpool add $TESTPOOL spare $sparedev
+
+	# 2. Simulate physical removal of one device
+	remove_disk $removedev
+
+	# 3. Verify the device is handled by the spare, unless it is an
+	#    l2arc disk, which can only be offlined
+	if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then
+		log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
+	else
+		log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+	fi
+
+	# 4. Reattach the device
+	insert_disk $removedev
+
+	# 5. Verify the device is onlined
+	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
+
+	# cleanup
+	destroy_pool $TESTPOOL
+	log_must parted "/dev/${removedev}" -s -- mklabel msdos
+	block_device_wait
+done
+
+# 7. Repeat the same tests again with a faulted spare device: zed should
+#    offline the removed data device if no spare is available
+for conf in "${poolconfs[@]}"
+do
+	# 1. Create a pool with a spare
+	log_must zpool create -f $TESTPOOL $conf
+	block_device_wait
+	log_must zpool add $TESTPOOL spare $sparedev
+
+	# 2. Fault the spare device making it unavailable
+	log_must zpool offline -f $TESTPOOL $sparedev
+	log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
+
+	# 3. Simulate physical removal of one device
+	remove_disk $removedev
+
+	# 4. Verify the device is offlined
+	log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+
+	# 5. Reattach the device
+	insert_disk $removedev
+
+	# 6. Verify the device is onlined
+	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
+
+	# cleanup
+	destroy_pool $TESTPOOL
+	log_must parted "/dev/${removedev}" -s -- mklabel msdos
+	block_device_wait
+done
+
+log_pass "ZED detects physically removed devices"
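
To exercise the fault tests (including this new case) through the suite
wrapper, something like the following should work from a built ZFS source
tree; treat it as a sketch and confirm the flags against the in-tree
scripts/zfs-tests.sh -h output:

    # Run all test cases tagged 'fault', verbosely; requires root and
    # the scsi_debug kernel module (loaded by load_scsi_debug) to
    # emulate the hot-pluggable disk the test removes and reattaches.
    ./scripts/zfs-tests.sh -v -T fault

Since the scsi_debug module is Linux-only, the new test logs "unsupported"
and is skipped on other platforms.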