summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--man/man5/zfs-module-parameters.516
-rw-r--r--man/man8/zpool.87
-rw-r--r--module/zfs/vdev_removal.c100
-rw-r--r--tests/runfiles/linux.run6
-rw-r--r--tests/zfs-tests/include/libtest.shlib22
-rw-r--r--tests/zfs-tests/tests/functional/removal/Makefile.am3
-rw-r--r--tests/zfs-tests/tests/functional/removal/removal.kshlib2
-rwxr-xr-xtests/zfs-tests/tests/functional/removal/removal_with_errors.ksh109
-rwxr-xr-xtests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh104
9 files changed, 350 insertions, 19 deletions
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 72a850f4f..837a7d1c3 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1985,6 +1985,22 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
+\fBzfs_removal_ignore_errors\fR (int)
+.ad
+.RS 12n
+.sp
+Ignore hard IO errors during device removal. When set, if a device encounters
+a hard IO error during the removal process the removal will not be cancelled.
+This can result in a normally recoverable block becoming permanently damaged
+and is not recommended. This should only be used as a last resort when the
+pool cannot be returned to a healthy state prior to removing the device.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_resilver_min_time_ms\fR (int)
.ad
.RS 12n
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index b9e0e1ad4..83e50bd01 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
-.Dd April 27, 2018
+.Dd November 29, 2018
.Dt ZPOOL 8 SMM
.Os Linux
.Sh NAME
@@ -1942,8 +1942,9 @@ In this case, the
command initiates the removal and returns, while the evacuation continues in
the background.
The removal progress can be monitored with
-.Nm zpool Cm status.
-The
+.Nm zpool Cm status .
+If an IO error is encountered during the removal process it will be
+cancelled. The
.Sy device_removal
feature flag must be enabled to remove a top-level vdev, see
.Xr zpool-features 5 .
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 49b9ed3a1..5952a5d8f 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -80,6 +80,8 @@
typedef struct vdev_copy_arg {
metaslab_t *vca_msp;
uint64_t vca_outstanding_bytes;
+ uint64_t vca_read_error_bytes;
+ uint64_t vca_write_error_bytes;
kcondvar_t vca_cv;
kmutex_t vca_lock;
} vdev_copy_arg_t;
@@ -100,6 +102,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
/*
+ * Ignore hard IO errors during device removal. When set if a device
+ * encounters hard IO error during the removal process the removal will
+ * not be cancelled. This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
* Allow a remap segment to span free chunks of at most this size. The main
* impact of a larger span is that we will read and write larger, more
* contiguous chunks, with more "unnecessary" data -- trading off bandwidth
@@ -126,6 +136,7 @@ int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
@@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
+
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
}
@@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
zio_nowait(zio_unique_parent(zio));
}
@@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
{
ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+ /*
+ * If the destination child in unwritable then there is no point
+ * in issuing the source reads which cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes += size;
mutex_exit(&vca->vca_lock);
abd_t *abd = abd_alloc_for_io(size, B_FALSE);
- vdev_t *source_child_vd;
+ vdev_t *source_child_vd = NULL;
if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
/*
* Source and dest are both mirrors. Copy from the same
* child id as we are copying to (wrapping around if there
- * are more dest children than source children).
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
*/
- source_child_vd =
- source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
} else {
source_child_vd = source_vd;
}
+ /*
+ * There should always be at least one readable source child or
+ * the pool would be in a suspended state. Somehow selecting an
+ * unreadable child would result in IO errors, the removal process
+ * being cancelled, and the pool reverting to its pre-removal state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
dest_child_vd, dest_offset, abd, size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
@@ -1361,6 +1404,8 @@ spa_vdev_remove_thread(void *arg)
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
mutex_enter(&svr->svr_lock);
@@ -1490,6 +1535,14 @@ spa_vdev_remove_thread(void *arg)
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
}
+
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
}
mutex_exit(&svr->svr_lock);
@@ -1511,6 +1564,21 @@ spa_vdev_remove_thread(void *arg)
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /*
+ * During the removal process an unrecoverable read or write
+ * error was encountered. The removal process must be
+ * cancelled or this damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ vca.vca_read_error_bytes,
+ vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
vdev_remove_complete(spa);
@@ -1689,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
{
- spa_vdev_remove_suspend(spa);
-
- if (spa->spa_vdev_removal == NULL)
- return (ENOTACTIVE);
-
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
@@ -1713,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+
+ return (spa_vdev_remove_cancel_impl(spa));
+}
+
/*
* Called every sync pass of every txg if there's a svr.
*/
@@ -2162,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
}
#if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+ "Ignore hard IO errors when removing device");
+
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index a4c6769db..3dd9656f0 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -728,9 +728,9 @@ tests = ['removal_all_vdev', 'removal_check_space',
'removal_remap', 'removal_remap_deadlists',
'removal_resume_export', 'removal_sanity', 'removal_with_add',
'removal_with_create_fs', 'removal_with_dedup',
- 'removal_with_export', 'removal_with_ganging',
- 'removal_with_remap', 'removal_with_remove',
- 'removal_with_scrub', 'removal_with_send',
+ 'removal_with_errors', 'removal_with_export',
+ 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remap',
+ 'removal_with_remove', 'removal_with_scrub', 'removal_with_send',
'removal_with_send_recv', 'removal_with_snapshot',
'removal_with_write', 'removal_with_zdb', 'remove_expanded',
'remove_mirror', 'remove_mirror_sanity', 'remove_raidz']
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index e66b2cbaf..2effa4207 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -1932,6 +1932,23 @@ function verify_filesys # pool filesystem dir
}
#
+# Given a pool issue a scrub and verify that no checksum errors are reported.
+#
+function verify_pool
+{
+ typeset pool=${1:-$TESTPOOL}
+
+ log_must zpool scrub $pool
+ log_must wait_scrubbed $pool
+
+ cksum=$(zpool status $pool | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
+ if [[ $cksum != 0 ]]; then
+ log_must zpool status -v
+ log_fail "Unexpected CKSUM errors found on $pool ($cksum)"
+ fi
+}
+
+#
# Given a pool, and this function list all disks in the pool
#
function get_disklist # pool
@@ -3025,8 +3042,11 @@ function vdevs_in_pool
shift
+ # We could use 'zpool list' to only get the vdevs of the pool but we
+ # can't reference a mirror/raidz vdev using its ID (i.e mirror-0),
+ # therefore we use the 'zpool status' output.
typeset tmpfile=$(mktemp)
- zpool list -Hv "$pool" >$tmpfile
+ zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile
for vdev in $@; do
grep -w ${vdev##*/} $tmpfile >/dev/null 2>&1
[[ $? -ne 0 ]] && return 1
diff --git a/tests/zfs-tests/tests/functional/removal/Makefile.am b/tests/zfs-tests/tests/functional/removal/Makefile.am
index c2b333a00..c5d013e7c 100644
--- a/tests/zfs-tests/tests/functional/removal/Makefile.am
+++ b/tests/zfs-tests/tests/functional/removal/Makefile.am
@@ -21,7 +21,8 @@ dist_pkgdata_SCRIPTS = \
removal_remap_deadlists.ksh removal_remap.ksh \
removal_reservation.ksh removal_resume_export.ksh \
removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \
- removal_with_dedup.ksh removal_with_export.ksh \
+ removal_with_dedup.ksh removal_with_errors.ksh \
+ removal_with_export.ksh removal_with_faulted.ksh \
removal_with_ganging.ksh removal_with_remap.ksh \
removal_with_remove.ksh removal_with_scrub.ksh \
removal_with_send.ksh removal_with_send_recv.ksh \
diff --git a/tests/zfs-tests/tests/functional/removal/removal.kshlib b/tests/zfs-tests/tests/functional/removal/removal.kshlib
index c1ab044c7..fa0174db0 100644
--- a/tests/zfs-tests/tests/functional/removal/removal.kshlib
+++ b/tests/zfs-tests/tests/functional/removal/removal.kshlib
@@ -141,6 +141,8 @@ function test_removal_with_operation # callback [args]
kill $killpid
wait
+
+ verify_pool $TESTPOOL
}
#
diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh
new file mode 100755
index 000000000..2ef56706a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh
@@ -0,0 +1,109 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/removal/removal.kshlib
+
+#
+# DESCRIPTION:
+#
+# This test ensures the device removal is cancelled when hard IO
+# errors are encountered during the removal process. This is done
+# to ensure that when removing a device all of the data is copied.
+#
+# STRATEGY:
+#
+# 1. We create a pool with enough redundancy such that IO errors
+# will not result in the pool being suspended.
+# 2. We write some test data to the pool.
+# 3. We inject READ errors in to one half of the top-level mirror-0
+# vdev which is being removed. Then we start the removal process.
+# 4. Verify that the injected read errors cause the removal of
+# mirror-0 to be cancelled and that mirror-0 has not been removed.
+# 5. Clear the read fault injection.
+# 6. Repeat steps 3-6 above except inject WRITE errors on one of
+# child vdevs in the destination mirror-1.
+# 7. Lastly verify the pool data is still intact.
+#
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+DISK0=$TMPDIR/dsk0
+DISK1=$TMPDIR/dsk1
+DISK2=$TMPDIR/dsk2
+DISK3=$TMPDIR/dsk3
+
+log_must truncate -s $MINVDEVSIZE $DISK0 $DISK1
+log_must truncate -s $((MINVDEVSIZE * 4)) $DISK2 $DISK3
+
+function cleanup
+{
+ log_must zinject -c all
+ default_cleanup_noexit
+ log_must rm -f $DISK0 $DISK1 $DISK2 $DISK3
+}
+
+function wait_for_removing_cancel
+{
+ typeset pool=$1
+
+ while is_pool_removing $pool; do
+ sleep 1
+ done
+
+ #
+ # The pool state changes before the TXG finishes syncing; wait for
+ # the removal to be completed on disk.
+ #
+ sync_pool
+
+ log_mustnot is_pool_removed $pool
+ return 0
+}
+
+default_setup_noexit "mirror $DISK0 $DISK1 mirror $DISK2 $DISK3"
+log_onexit cleanup
+
+FILE_CONTENTS="Leeloo Dallas mul-ti-pass."
+
+echo $FILE_CONTENTS >$TESTDIR/$TESTFILE0
+log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ]
+log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**7))
+sync_pool $TESTPOOL
+
+# Verify that unexpected read errors automatically cancel the removal.
+log_must zinject -d $DISK0 -e io -T all -f 100 $TESTPOOL
+log_must zpool remove $TESTPOOL mirror-0
+log_must wait_for_removing_cancel $TESTPOOL
+log_must vdevs_in_pool $TESTPOOL mirror-0
+log_must zinject -c all
+
+# Verify that unexpected write errors automatically cancel the removal.
+log_must zinject -d $DISK3 -e io -T all -f 100 $TESTPOOL
+log_must zpool remove $TESTPOOL mirror-0
+log_must wait_for_removing_cancel $TESTPOOL
+log_must vdevs_in_pool $TESTPOOL mirror-0
+log_must zinject -c all
+
+log_must dd if=$TESTDIR/$TESTFILE0 of=/dev/null
+log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ]
+log_must dd if=$TESTDIR/$TESTFILE1 of=/dev/null
+
+log_pass "Device not removed due to unexpected errors."
diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh
new file mode 100755
index 000000000..44d222860
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh
@@ -0,0 +1,104 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/removal/removal.kshlib
+
+#
+# DESCRIPTION:
+#
+# This test ensures that even when child vdevs are unavailable the
+# device removal process copies from readable source children to
+# writable destination children. This may be different than the
+# default mapping which preferentially pairs up source and destination
+# child vdevs based on their child ids.
+#
+# Default Mapping:
+# mirror-0 mirror-1
+# DISK0 (child 0) ------> DISK2 (child 0)
+# DISK1 (child 1) ------> DISK3 (child 1)
+#
+# We want to setup a scenario where the default mapping would make
+# it impossible to copy any data during the removal process. This
+# is done by faulting both the mirror-0 (child 0) source vdev and
+# mirror-1 (child 1) destination vdev. As shown below the default
+# mapping cannot be used due to the FAULTED vdevs. Verify that an
+# alternate mapping is selected and all the readable data is copied.
+#
+# Default Mapping (BAD):
+# mirror-0 mirror-1
+# DISK0 (FAULTED) ------> DISK2
+# DISK1 ----------------> DISK3 (FAULTED)
+#
+# Required Mapping (GOOD):
+# mirror-0 mirror-1
+# DISK0 (FAULTED) +---> DISK2
+# DISK1 ------------+ DISK3 (FAULTED)
+#
+# STRATEGY:
+#
+# 1. We create a pool with two top-level mirror vdevs.
+# 2. We write some test data to the pool.
+# 3. We fault two children to force the scenario described above.
+# 4. We remove the mirror-0 device.
+# 5. We verify that the device has been removed and that all of the
+# data is still intact.
+#
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+DISK0=$TMPDIR/dsk0
+DISK1=$TMPDIR/dsk1
+DISK2=$TMPDIR/dsk2
+DISK3=$TMPDIR/dsk3
+
+log_must truncate -s $MINVDEVSIZE $DISK0 $DISK1
+log_must truncate -s $((MINVDEVSIZE * 4)) $DISK2 $DISK3
+
+function cleanup
+{
+ default_cleanup_noexit
+ log_must rm -f $DISK0 $DISK1 $DISK2 $DISK3
+}
+
+default_setup_noexit "mirror $DISK0 $DISK1 mirror $DISK2 $DISK3"
+log_onexit cleanup
+
+log_must zpool offline -f $TESTPOOL $DISK0
+log_must zpool offline -f $TESTPOOL $DISK3
+
+FILE_CONTENTS="Leeloo Dallas mul-ti-pass."
+
+echo $FILE_CONTENTS >$TESTDIR/$TESTFILE0
+log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ]
+log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**7))
+sync_pool $TESTPOOL
+
+log_must zpool remove $TESTPOOL mirror-0
+log_must wait_for_removal $TESTPOOL
+log_mustnot vdevs_in_pool $TESTPOOL mirror-0
+
+verify_pool $TESTPOOL
+
+log_must dd if=$TESTDIR/$TESTFILE0 of=/dev/null
+log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ]
+log_must dd if=$TESTDIR/$TESTFILE1 of=/dev/null
+
+log_pass "Can remove with faulted vdevs"