diff options
-rw-r--r-- | man/man5/zfs-module-parameters.5 | 16 | ||||
-rw-r--r-- | man/man8/zpool.8 | 7 | ||||
-rw-r--r-- | module/zfs/vdev_removal.c | 100 | ||||
-rw-r--r-- | tests/runfiles/linux.run | 6 | ||||
-rw-r--r-- | tests/zfs-tests/include/libtest.shlib | 22 | ||||
-rw-r--r-- | tests/zfs-tests/tests/functional/removal/Makefile.am | 3 | ||||
-rw-r--r-- | tests/zfs-tests/tests/functional/removal/removal.kshlib | 2 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh | 109 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh | 104 |
9 files changed, 350 insertions, 19 deletions
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 72a850f4f..837a7d1c3 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1985,6 +1985,22 @@ Use \fB1\fR for yes and \fB0\fR for no (default). .sp .ne 2 .na +\fBzfs_removal_ignore_errors\fR (int) +.ad +.RS 12n +.sp +Ignore hard IO errors during device removal. When set, if a device encounters +a hard IO error during the removal process the removal will not be cancelled. +This can result in a normally recoverable block becoming permanently damaged +and is not recommended. This should only be used as a last resort when the +pool cannot be returned to a healthy state prior to removing the device. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_resilver_min_time_ms\fR (int) .ad .RS 12n diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index b9e0e1ad4..83e50bd01 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd April 27, 2018 +.Dd November 29, 2018 .Dt ZPOOL 8 SMM .Os Linux .Sh NAME @@ -1942,8 +1942,9 @@ In this case, the command initiates the removal and returns, while the evacuation continues in the background. The removal progress can be monitored with -.Nm zpool Cm status. -The +.Nm zpool Cm status . +If an IO error is encountered during the removal process it will be +cancelled. The .Sy device_removal feature flag must be enabled to remove a top-level vdev, see .Xr zpool-features 5 . diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 49b9ed3a1..5952a5d8f 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -80,6 +80,8 @@ typedef struct vdev_copy_arg { metaslab_t *vca_msp; uint64_t vca_outstanding_bytes; + uint64_t vca_read_error_bytes; + uint64_t vca_write_error_bytes; kcondvar_t vca_cv; kmutex_t vca_lock; } vdev_copy_arg_t; @@ -100,6 +102,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* + * Ignore hard IO errors during device removal. When set if a device + * encounters hard IO error during the removal process the removal will + * not be cancelled. This can result in a normally recoverable block + * becoming permanently damaged and is not recommended. + */ +int zfs_removal_ignore_errors = 0; + +/* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth @@ -126,6 +136,7 @@ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static void spa_vdev_remove_thread(void *arg); +static int spa_vdev_remove_cancel_impl(spa_t *spa); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) @@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio) mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; + + if (zio->io_error != 0) + vca->vca_write_error_bytes += zio->io_size; + cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); } @@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio) static void spa_vdev_copy_segment_read_done(zio_t *zio) { + vdev_copy_arg_t *vca = zio->io_private; + + if (zio->io_error != 0) { + mutex_enter(&vca->vca_lock); + vca->vca_read_error_bytes += zio->io_size; + mutex_exit(&vca->vca_lock); + } + zio_nowait(zio_unique_parent(zio)); } @@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, { ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + /* + * If the destination child in unwritable then there is no point + * in issuing the source reads which cannot be written. + */ + if (!vdev_writeable(dest_child_vd)) + return; + mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes += size; mutex_exit(&vca->vca_lock); abd_t *abd = abd_alloc_for_io(size, B_FALSE); - vdev_t *source_child_vd; + vdev_t *source_child_vd = NULL; if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { /* * Source and dest are both mirrors. Copy from the same * child id as we are copying to (wrapping around if there - * are more dest children than source children). + * are more dest children than source children). If the + * preferred source child is unreadable select another. */ - source_child_vd = - source_vd->vdev_child[dest_id % source_vd->vdev_children]; + for (int i = 0; i < source_vd->vdev_children; i++) { + source_child_vd = source_vd->vdev_child[ + (dest_id + i) % source_vd->vdev_children]; + if (vdev_readable(source_child_vd)) + break; + } } else { source_child_vd = source_vd; } + /* + * There should always be at least one readable source child or + * the pool would be in a suspended state. Somehow selecting an + * unreadable child would result in IO errors, the removal process + * being cancelled, and the pool reverting to its pre-removal state. + */ + ASSERT3P(source_child_vd, !=, NULL); + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_child_vd, dest_offset, abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, @@ -1361,6 +1404,8 @@ spa_vdev_remove_thread(void *arg) mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); vca.vca_outstanding_bytes = 0; + vca.vca_read_error_bytes = 0; + vca.vca_write_error_bytes = 0; mutex_enter(&svr->svr_lock); @@ -1490,6 +1535,14 @@ spa_vdev_remove_thread(void *arg) dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); } + + mutex_enter(&vca.vca_lock); + if (zfs_removal_ignore_errors == 0 && + (vca.vca_read_error_bytes > 0 || + vca.vca_write_error_bytes > 0)) { + svr->svr_thread_exit = B_TRUE; + } + mutex_exit(&vca.vca_lock); } mutex_exit(&svr->svr_lock); @@ -1511,6 +1564,21 @@ spa_vdev_remove_thread(void *arg) svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); + + /* + * During the removal process an unrecoverable read or write + * error was encountered. The removal process must be + * cancelled or this damage may become permanent. + */ + if (zfs_removal_ignore_errors == 0 && + (vca.vca_read_error_bytes > 0 || + vca.vca_write_error_bytes > 0)) { + zfs_dbgmsg("canceling removal due to IO errors: " + "[read_error_bytes=%llu] [write_error_bytes=%llu]", + vca.vca_read_error_bytes, + vca.vca_write_error_bytes); + spa_vdev_remove_cancel_impl(spa); + } } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); @@ -1689,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } -int -spa_vdev_remove_cancel(spa_t *spa) +static int +spa_vdev_remove_cancel_impl(spa_t *spa) { - spa_vdev_remove_suspend(spa); - - if (spa->spa_vdev_removal == NULL) - return (ENOTACTIVE); - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, @@ -1713,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa) return (error); } +int +spa_vdev_remove_cancel(spa_t *spa) +{ + spa_vdev_remove_suspend(spa); + + if (spa->spa_vdev_removal == NULL) + return (ENOTACTIVE); + + return (spa_vdev_remove_cancel_impl(spa)); +} + /* * Called every sync pass of every txg if there's a svr. */ @@ -2162,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) } #if defined(_KERNEL) +module_param(zfs_removal_ignore_errors, int, 0644); +MODULE_PARM_DESC(zfs_removal_ignore_errors, + "Ignore hard IO errors when removing device"); + module_param(zfs_remove_max_segment, int, 0644); MODULE_PARM_DESC(zfs_remove_max_segment, "Largest contiguous segment to allocate when removing device"); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a4c6769db..3dd9656f0 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -728,9 +728,9 @@ tests = ['removal_all_vdev', 'removal_check_space', 'removal_remap', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', - 'removal_with_export', 'removal_with_ganging', - 'removal_with_remap', 'removal_with_remove', - 'removal_with_scrub', 'removal_with_send', + 'removal_with_errors', 'removal_with_export', + 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remap', + 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index e66b2cbaf..2effa4207 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1932,6 +1932,23 @@ function verify_filesys # pool filesystem dir } # +# Given a pool issue a scrub and verify that no checksum errors are reported. +# +function verify_pool +{ + typeset pool=${1:-$TESTPOOL} + + log_must zpool scrub $pool + log_must wait_scrubbed $pool + + cksum=$(zpool status $pool | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') + if [[ $cksum != 0 ]]; then + log_must zpool status -v + log_fail "Unexpected CKSUM errors found on $pool ($cksum)" + fi +} + +# # Given a pool, and this function list all disks in the pool # function get_disklist # pool @@ -3025,8 +3042,11 @@ function vdevs_in_pool shift + # We could use 'zpool list' to only get the vdevs of the pool but we + # can't reference a mirror/raidz vdev using its ID (i.e mirror-0), + # therefore we use the 'zpool status' output. typeset tmpfile=$(mktemp) - zpool list -Hv "$pool" >$tmpfile + zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile for vdev in $@; do grep -w ${vdev##*/} $tmpfile >/dev/null 2>&1 [[ $? -ne 0 ]] && return 1 diff --git a/tests/zfs-tests/tests/functional/removal/Makefile.am b/tests/zfs-tests/tests/functional/removal/Makefile.am index c2b333a00..c5d013e7c 100644 --- a/tests/zfs-tests/tests/functional/removal/Makefile.am +++ b/tests/zfs-tests/tests/functional/removal/Makefile.am @@ -21,7 +21,8 @@ dist_pkgdata_SCRIPTS = \ removal_remap_deadlists.ksh removal_remap.ksh \ removal_reservation.ksh removal_resume_export.ksh \ removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \ - removal_with_dedup.ksh removal_with_export.ksh \ + removal_with_dedup.ksh removal_with_errors.ksh \ + removal_with_export.ksh removal_with_faulted.ksh \ removal_with_ganging.ksh removal_with_remap.ksh \ removal_with_remove.ksh removal_with_scrub.ksh \ removal_with_send.ksh removal_with_send_recv.ksh \ diff --git a/tests/zfs-tests/tests/functional/removal/removal.kshlib b/tests/zfs-tests/tests/functional/removal/removal.kshlib index c1ab044c7..fa0174db0 100644 --- a/tests/zfs-tests/tests/functional/removal/removal.kshlib +++ b/tests/zfs-tests/tests/functional/removal/removal.kshlib @@ -141,6 +141,8 @@ function test_removal_with_operation # callback [args] kill $killpid wait + + verify_pool $TESTPOOL } # diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh new file mode 100755 index 000000000..2ef56706a --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh @@ -0,0 +1,109 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2014, 2017 by Delphix. All rights reserved. +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# +# This test ensures the device removal is cancelled when hard IO +# errors are encountered during the removal process. This is done +# to ensure that when removing a device all of the data is copied. +# +# STRATEGY: +# +# 1. We create a pool with enough redundancy such that IO errors +# will not result in the pool being suspended. +# 2. We write some test data to the pool. +# 3. We inject READ errors in to one half of the top-level mirror-0 +# vdev which is being removed. Then we start the removal process. +# 4. Verify that the injected read errors cause the removal of +# mirror-0 to be cancelled and that mirror-0 has not been removed. +# 5. Clear the read fault injection. +# 6. Repeat steps 3-6 above except inject WRITE errors on one of +# child vdevs in the destination mirror-1. +# 7. Lastly verify the pool data is still intact. +# + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} +DISK0=$TMPDIR/dsk0 +DISK1=$TMPDIR/dsk1 +DISK2=$TMPDIR/dsk2 +DISK3=$TMPDIR/dsk3 + +log_must truncate -s $MINVDEVSIZE $DISK0 $DISK1 +log_must truncate -s $((MINVDEVSIZE * 4)) $DISK2 $DISK3 + +function cleanup +{ + log_must zinject -c all + default_cleanup_noexit + log_must rm -f $DISK0 $DISK1 $DISK2 $DISK3 +} + +function wait_for_removing_cancel +{ + typeset pool=$1 + + while is_pool_removing $pool; do + sleep 1 + done + + # + # The pool state changes before the TXG finishes syncing; wait for + # the removal to be completed on disk. + # + sync_pool + + log_mustnot is_pool_removed $pool + return 0 +} + +default_setup_noexit "mirror $DISK0 $DISK1 mirror $DISK2 $DISK3" +log_onexit cleanup + +FILE_CONTENTS="Leeloo Dallas mul-ti-pass." + +echo $FILE_CONTENTS >$TESTDIR/$TESTFILE0 +log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ] +log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**7)) +sync_pool $TESTPOOL + +# Verify that unexpected read errors automatically cancel the removal. +log_must zinject -d $DISK0 -e io -T all -f 100 $TESTPOOL +log_must zpool remove $TESTPOOL mirror-0 +log_must wait_for_removing_cancel $TESTPOOL +log_must vdevs_in_pool $TESTPOOL mirror-0 +log_must zinject -c all + +# Verify that unexpected write errors automatically cancel the removal. +log_must zinject -d $DISK3 -e io -T all -f 100 $TESTPOOL +log_must zpool remove $TESTPOOL mirror-0 +log_must wait_for_removing_cancel $TESTPOOL +log_must vdevs_in_pool $TESTPOOL mirror-0 +log_must zinject -c all + +log_must dd if=$TESTDIR/$TESTFILE0 of=/dev/null +log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ] +log_must dd if=$TESTDIR/$TESTFILE1 of=/dev/null + +log_pass "Device not removed due to unexpected errors." diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh new file mode 100755 index 000000000..44d222860 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh @@ -0,0 +1,104 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2014, 2017 by Delphix. All rights reserved. +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# +# This test ensures that even when child vdevs are unavailable the +# device removal process copies from readable source children to +# writable destination children. This may be different than the +# default mapping which preferentially pairs up source and destination +# child vdevs based on their child ids. +# +# Default Mapping: +# mirror-0 mirror-1 +# DISK0 (child 0) ------> DISK2 (child 0) +# DISK1 (child 1) ------> DISK3 (child 1) +# +# We want to setup a scenario where the default mapping would make +# it impossible to copy any data during the removal process. This +# is done by faulting both the mirror-0 (child 0) source vdev and +# mirror-1 (child 1) destination vdev. As shown below the default +# mapping cannot be used due to the FAULTED vdevs. Verify that an +# alternate mapping is selected and all the readable data is copied. +# +# Default Mapping (BAD): +# mirror-0 mirror-1 +# DISK0 (FAULTED) ------> DISK2 +# DISK1 ----------------> DISK3 (FAULTED) +# +# Required Mapping (GOOD): +# mirror-0 mirror-1 +# DISK0 (FAULTED) +---> DISK2 +# DISK1 ------------+ DISK3 (FAULTED) +# +# STRATEGY: +# +# 1. We create a pool with two top-level mirror vdevs. +# 2. We write some test data to the pool. +# 3. We fault two children to force the scenario described above. +# 4. We remove the mirror-0 device. +# 5. We verify that the device has been removed and that all of the +# data is still intact. +# + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} +DISK0=$TMPDIR/dsk0 +DISK1=$TMPDIR/dsk1 +DISK2=$TMPDIR/dsk2 +DISK3=$TMPDIR/dsk3 + +log_must truncate -s $MINVDEVSIZE $DISK0 $DISK1 +log_must truncate -s $((MINVDEVSIZE * 4)) $DISK2 $DISK3 + +function cleanup +{ + default_cleanup_noexit + log_must rm -f $DISK0 $DISK1 $DISK2 $DISK3 +} + +default_setup_noexit "mirror $DISK0 $DISK1 mirror $DISK2 $DISK3" +log_onexit cleanup + +log_must zpool offline -f $TESTPOOL $DISK0 +log_must zpool offline -f $TESTPOOL $DISK3 + +FILE_CONTENTS="Leeloo Dallas mul-ti-pass." + +echo $FILE_CONTENTS >$TESTDIR/$TESTFILE0 +log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ] +log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**7)) +sync_pool $TESTPOOL + +log_must zpool remove $TESTPOOL mirror-0 +log_must wait_for_removal $TESTPOOL +log_mustnot vdevs_in_pool $TESTPOOL mirror-0 + +verify_pool $TESTPOOL + +log_must dd if=$TESTDIR/$TESTFILE0 of=/dev/null +log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ] +log_must dd if=$TESTDIR/$TESTFILE1 of=/dev/null + +log_pass "Can remove with faulted vdevs" |