-rw-r--r--   module/zfs/vdev_mirror.c                                                      84
-rw-r--r--   module/zfs/vdev_raidz.c                                                       18
-rw-r--r--   tests/runfiles/common.run                                                      3
-rwxr-xr-x   tests/test-runner/bin/zts-report.py.in                                         2
-rw-r--r--   tests/zfs-tests/tests/functional/redundancy/Makefile.am                        3
-rwxr-xr-x   tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh     17
             (renamed from tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh)
-rwxr-xr-x   tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh    157
7 files changed, 251 insertions, 33 deletions
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 45b744b2e..977850698 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -35,6 +35,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
*/
typedef struct mirror_child {
vdev_t *mc_vd;
+ abd_t *mc_abd;
uint64_t mc_offset;
int mc_error;
int mc_load;
@@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio)
{
mirror_child_t *mc = zio->io_private;
+ /* See scrubbing read comment in vdev_mirror_io_start() */
+ if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
+ mc->mc_abd = zio->io_abd;
+
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
mc->mc_skipped = 0;
@@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio)
}
if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_bp != NULL &&
- (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
- * For scrubbing reads (if we can verify the
- * checksum here, as indicated by io_bp being
- * non-NULL) we need to allocate a read buffer for
- * each child and issue reads to all children. If
- * any child succeeds, it will copy its data into
- * zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads we need to allocate a buffer
+ * for each child and issue reads to all children.
+ * If we can verify the checksum here, as indicated
+ * by io_bp being non-NULL, the data will be copied
+ * into zio->io_data in vdev_mirror_scrub_done().
+ * If not, then vdev_mirror_io_done() will compare
+ * all of the read buffers and return a checksum
+ * error if they aren't all identical.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio)
abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
- vdev_mirror_scrub_done, mc));
+ zio->io_bp ? vdev_mirror_scrub_done :
+ vdev_mirror_child_done, mc));
}
zio_execute(zio);
return;
@@ -731,6 +739,7 @@ vdev_mirror_io_done(zio_t *zio)
int c;
int good_copies = 0;
int unexpected_errors = 0;
+ int last_good_copy = -1;
if (mm == NULL)
return;
@@ -742,6 +751,7 @@ vdev_mirror_io_done(zio_t *zio)
if (!mc->mc_skipped)
unexpected_errors++;
} else if (mc->mc_tried) {
+ last_good_copy = c;
good_copies++;
}
}
@@ -755,7 +765,6 @@ vdev_mirror_io_done(zio_t *zio)
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
- /* XXPOLICY */
if (good_copies != mm->mm_children) {
/*
* Always require at least one good copy.
@@ -782,7 +791,6 @@ vdev_mirror_io_done(zio_t *zio)
/*
* If we don't have a good copy yet, keep trying other children.
*/
- /* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
@@ -794,7 +802,59 @@ vdev_mirror_io_done(zio_t *zio)
return;
}
- /* XXPOLICY */
+ /*
+ * If we're scrubbing but don't have a BP available (because this
+ * vdev is under a raidz or draid vdev) then the best we can do is
+ * compare all of the copies read. If they're not identical then
+ * return a checksum error and the most likely correct data. The
+ * raidz code will issue a repair I/O if possible.
+ */
+ if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
+ abd_t *last_good_abd;
+
+ ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
+ zio->io_vd->vdev_ops == &vdev_spare_ops);
+
+ if (good_copies > 1) {
+ last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+ abd_t *best_abd = NULL;
+
+ for (c = 0; c < last_good_copy; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error || !mc->mc_tried)
+ continue;
+
+ if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
+ zio->io_error = SET_ERROR(ECKSUM);
+
+ /*
+			 * The distributed spare is always preferred
+ * by vdev_mirror_child_select() so it's
+ * considered to be the best candidate.
+ */
+ if (best_abd == NULL &&
+ mc->mc_vd->vdev_ops ==
+ &vdev_draid_spare_ops) {
+ best_abd = mc->mc_abd;
+ }
+ }
+
+ abd_copy(zio->io_abd, best_abd ? best_abd :
+ last_good_abd, zio->io_size);
+
+ } else if (good_copies == 1) {
+ last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+ abd_copy(zio->io_abd, last_good_abd, zio->io_size);
+ }
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ abd_free(mc->mc_abd);
+ mc->mc_abd = NULL;
+ }
+ }
+
if (good_copies == 0) {
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
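The new vdev_mirror_io_done() logic above is easiest to follow outside the kernel. Below is a minimal userspace sketch of the BP-less scrub comparison, under stated assumptions: plain byte buffers stand in for abd_t, memcmp() for abd_cmp(), and the child_copy_t type and scrub_copies_agree() function are illustrative names, not the kernel API. It compares every good copy against the last one read, reports a mismatch the way the patch raises ECKSUM, and prefers the distributed spare's copy as the repair source.

/*
 * Userspace sketch of the BP-less scrub path added to
 * vdev_mirror_io_done(). Illustrative only: byte buffers stand in
 * for abd_t and memcmp() for abd_cmp().
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct child_copy {
	const unsigned char *buf;	/* data read from this child */
	bool tried;			/* a read was issued */
	int error;			/* nonzero if the read failed */
	bool is_dspare;			/* distributed spare child */
} child_copy_t;

/*
 * Compare every good copy against the last good copy. Returns false
 * (the analog of setting ECKSUM) if any two copies differ. *bestp is
 * set to the copy a repair should use: the distributed spare when one
 * was read, since vdev_mirror_child_select() always prefers it,
 * otherwise the last good copy.
 */
static bool
scrub_copies_agree(const child_copy_t *mc, int children, size_t size,
    const unsigned char **bestp)
{
	int last_good = -1;
	bool agree = true;

	for (int c = 0; c < children; c++) {
		if (mc[c].tried && mc[c].error == 0)
			last_good = c;
	}
	*bestp = (last_good < 0) ? NULL : mc[last_good].buf;
	if (last_good < 0)
		return (false);

	for (int c = 0; c < last_good; c++) {
		if (!mc[c].tried || mc[c].error != 0)
			continue;
		if (memcmp(mc[c].buf, mc[last_good].buf, size) != 0)
			agree = false;	/* caller flags ECKSUM */
		if (mc[c].is_dspare)
			*bestp = mc[c].buf;
	}
	return (agree);
}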
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index b14e995e3..888f9de57 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1791,6 +1791,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
+
+ if (rc->rc_force_repair)
+ unexpected_errors++;
}
/*
@@ -2155,9 +2158,20 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- if (rc->rc_error) {
- ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+ /*
+ * If scrubbing and a replacing/sparing child vdev determined
+ * that not all of its children have an identical copy of the
+ * data, then clear the error so the column is treated like
+ * any other read and force a repair to correct the damage.
+ */
+ if (rc->rc_error == ECKSUM) {
+ ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
+ vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
+ rc->rc_force_repair = 1;
+ rc->rc_error = 0;
+ }
+ if (rc->rc_error) {
if (c < rr->rr_firstdatacol)
parity_errors++;
else
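The raidz-side change is small enough to isolate as well. Here is a hedged sketch of the error-absorption step, with the surrounding kernel types reduced to a stub struct; the raidz_col_stub_t type, absorb_child_cksum_error() name, and the ECKSUM value are illustrative, not the kernel's definitions.

/* Sketch of the raidz-side handling; types and values are stubs. */
#include <assert.h>

#define	ECKSUM	122	/* illustrative; the kernel defines its own */

typedef struct raidz_col_stub {
	int rc_error;		/* error reported by the child zio */
	int rc_force_repair;	/* column must be rewritten */
} raidz_col_stub_t;

/*
 * A child ECKSUM here can only come from a replacing/sparing mirror
 * that found non-identical copies during a scrub. Clear the error so
 * the column's data is still used like any other read, but force a
 * repair write so the damaged copy gets corrected.
 */
static void
absorb_child_cksum_error(raidz_col_stub_t *rc, int scrubbing)
{
	if (rc->rc_error == ECKSUM) {
		assert(scrubbing);
		rc->rc_force_repair = 1;
		rc->rc_error = 0;
	}
}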
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 19919a00a..709bd2533 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -747,7 +747,8 @@ tags = ['functional', 'raidz']
[tests/functional/redundancy]
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
- 'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
+ 'redundancy_draid3', 'redundancy_draid_damaged1',
+ 'redundancy_draid_damaged2', 'redundancy_draid_spare1',
'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
'redundancy_raidz3', 'redundancy_stripe']
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 559e98dd0..71b0cc8d6 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -244,8 +244,6 @@ maybe = {
'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason],
'pool_checkpoint/checkpoint_discard_busy': ['FAIL', '11946'],
'projectquota/setup': ['SKIP', exec_reason],
- 'redundancy/redundancy_004_neg': ['FAIL', '7290'],
- 'redundancy/redundancy_draid_spare3': ['SKIP', known_reason],
'removal/removal_condense_export': ['FAIL', known_reason],
'reservation/reservation_008_pos': ['FAIL', '7741'],
'reservation/reservation_018_pos': ['FAIL', '5642'],
diff --git a/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/tests/zfs-tests/tests/functional/redundancy/Makefile.am
index 42c11c4aa..7c1930beb 100644
--- a/tests/zfs-tests/tests/functional/redundancy/Makefile.am
+++ b/tests/zfs-tests/tests/functional/redundancy/Makefile.am
@@ -6,7 +6,8 @@ dist_pkgdata_SCRIPTS = \
redundancy_draid1.ksh \
redundancy_draid2.ksh \
redundancy_draid3.ksh \
- redundancy_draid_damaged.ksh \
+ redundancy_draid_damaged1.ksh \
+ redundancy_draid_damaged2.ksh \
redundancy_draid_spare1.ksh \
redundancy_draid_spare2.ksh \
redundancy_draid_spare3.ksh \
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
index 6796cc78a..1c1183c09 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
@@ -89,22 +89,9 @@ function test_sequential_resilver # <pool> <parity> <dir>
done
log_must zpool scrub -w $pool
+ log_must zpool status $pool
- # When only a single child was overwritten the sequential resilver
- # can fully repair the damange from parity and the scrub will have
- # nothing to repair. When multiple children are silently damaged
- # the sequential resilver will calculate the wrong data since only
- # the parity information is used and it cannot be verified with
- # the checksum. However, since only the resilvering devices are
- # written to with the bad data a subsequent scrub will be able to
- # fully repair the pool.
- #
- if [[ $nparity == 1 ]]; then
- log_must check_pool_status $pool "scan" "repaired 0B"
- else
- log_mustnot check_pool_status $pool "scan" "repaired 0B"
- fi
-
+ log_mustnot check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
}
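The reasoning in the comment dropped above, that a sequential resilver can reconstruct the wrong data when more than one child is silently damaged, is worth a worked example. Below is a minimal single-parity XOR illustration; it is not dRAID's actual parity code, just the arithmetic that makes the point.

/* Single-parity XOR illustration; not dRAID's actual parity code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t d0 = 0xAA, d1 = 0x55;
	uint8_t p = d0 ^ d1;		/* parity computed at write time */

	uint8_t d1_bad = 0x54;		/* silent corruption of child 1 */
	uint8_t d0_rebuilt = p ^ d1_bad; /* sequential rebuild of child 0 */

	/*
	 * The rebuild has no checksum to verify against, so it writes
	 * the wrong data without noticing; only the later scrub, which
	 * does verify checksums, can detect and repair the result.
	 */
	assert(d0_rebuilt != d0);
	(void) printf("d0=%02x rebuilt=%02x\n", d0, d0_rebuilt);
	return (0);
}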
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
new file mode 100755
index 000000000..8e06db9ba
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
@@ -0,0 +1,157 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
+
+#
+# DESCRIPTION:
+# When sequentially resilvering a dRAID pool to a distributed spare
+# silent damage to an online vdev in a replacing or spare mirror vdev
+# is not expected to be repaired. Not only does the rebuild have no
+# reason to suspect the silent damage, but even if it did, there's no
+# checksum available to determine the correct copy and make the repair.
+# However, the subsequent scrub should detect and repair any damage.
+#
+# STRATEGY:
+# 1. Create block device files for the test draid pool
+# 2. For each parity value [1..3]
+# a. Create a draid pool
+# b. Fill it with some directories/files
+# c. Systematically damage and replace three devices by:
+# - Overwrite the device
+# - Replace the damaged vdev with a distributed spare
+# - Scrub the pool and verify repair IO is issued
+# d. Detach the distributed spares
+# e. Scrub the pool and verify there was nothing to repair
+# f. Destroy the draid pool
+#
+
+typeset -r devs=7
+typeset -r dev_size_mb=512
+typeset -a disks
+
+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
+
+function cleanup
+{
+ poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
+
+ for i in {0..$devs}; do
+ rm -f "$TEST_BASE_DIR/dev-$i"
+ done
+
+ set_tunable32 PREFETCH_DISABLE $prefetch_disable
+ set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
+}
+
+log_onexit cleanup
+
+log_must set_tunable32 PREFETCH_DISABLE 1
+log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
+
+# Disk files which will be used by pool
+for i in {0..$(($devs - 1))}; do
+ device=$TEST_BASE_DIR/dev-$i
+ log_must truncate -s ${dev_size_mb}M $device
+ disks[${#disks[*]}+1]=$device
+done
+
+# Disk file which will be attached
+log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
+
+dir=$TEST_BASE_DIR
+
+for nparity in 1 2 3; do
+ raid=draid${nparity}:3s
+
+ log_must zpool create -f -O compression=off -o cachefile=none \
+ $TESTPOOL $raid ${disks[@]}
+ # log_must zfs set primarycache=metadata $TESTPOOL
+
+ log_must zfs create $TESTPOOL/fs
+ log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R
+
+ log_must zfs create -o compress=on $TESTPOOL/fs2
+ log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R
+
+ log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
+ log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R
+
+ log_must zpool export $TESTPOOL
+ log_must zpool import -o cachefile=none -d $dir $TESTPOOL
+
+ log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+ for nspare in 0 1 2; do
+ damaged=$dir/dev-${nspare}
+ spare=draid${nparity}-0-${nspare}
+
+ log_must zpool export $TESTPOOL
+ log_must dd conv=notrunc if=/dev/zero of=$damaged \
+ bs=1M seek=4 count=$(($dev_size_mb-4))
+ log_must zpool import -o cachefile=none -d $dir $TESTPOOL
+
+ log_must zpool replace -fsw $TESTPOOL $damaged $spare
+
+ # Scrub the pool after the sequential resilver and verify
+ # that the silent damage was repaired by the scrub.
+ log_must zpool scrub -w $TESTPOOL
+ log_must zpool status $TESTPOOL
+ log_must check_pool_status $TESTPOOL "errors" \
+ "No known data errors"
+ log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+ log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
+ done
+
+ for nspare in 0 1 2; do
+ log_must check_vdev_state $TESTPOOL \
+ spare-${nspare} "ONLINE"
+ log_must check_vdev_state $TESTPOOL \
+ ${dir}/dev-${nspare} "ONLINE"
+ log_must check_vdev_state $TESTPOOL \
+ draid${nparity}-0-${nspare} "ONLINE"
+ done
+
+ # Detach the distributed spares and scrub the pool again to
+ # verify no damage remained on the originally corrupted vdevs.
+ for nspare in 0 1 2; do
+ log_must zpool detach $TESTPOOL draid${nparity}-0-${nspare}
+ done
+
+ log_must zpool clear $TESTPOOL
+ log_must zpool scrub -w $TESTPOOL
+ log_must zpool status $TESTPOOL
+
+ log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+ log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+ log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
+
+ log_must zpool destroy "$TESTPOOL"
+done
+
+log_pass "draid damaged device scrub test succeeded."