Detect IO errors during device removal

* Detect IO errors during device removal While device removal cannot verify the checksums of individual blocks, it can reasonably detect hard IO errors from the leaf vdevs. Failure to perform this error checking can result in device removal completing successfully but moving no data, which will permanently corrupt the pool. Situation 1: faulted/degraded vdevs In the configuration shown below, the removal of mirror-0 will permanently corrupt the pool. Device removal will preferentially copy data from 'vdev1 -> vdev3' and from 'vdev2 -> vdev4'. In this case that will result in nothing being copied, since one vdev in each of those groups is unavailable. However, device removal will complete successfully since all IO errors are ignored. tank DEGRADED 0 0 0 mirror-0 DEGRADED 0 0 0 /var/tmp/vdev1 FAULTED 0 0 0 external fault /var/tmp/vdev2 ONLINE 0 0 0 mirror-1 DEGRADED 0 0 0 /var/tmp/vdev3 ONLINE 0 0 0 /var/tmp/vdev4 FAULTED 0 0 0 external fault This issue is resolved by updating the source child selection logic to exclude unreadable leaf vdevs. Additionally, unwritable destination child vdevs, to which writes can never succeed, are skipped to prevent generating a large number of write IO errors. Situation 2: individual hard IO errors During removal, if an unexpected hard IO error is encountered when either reading or writing the child vdev, the entire removal operation is cancelled. While it may be possible to reconstruct the data after removal, that cannot be guaranteed. The only strictly safe thing to do is to cancel the removal. As a future improvement we may want to instead suspend the removal process and allow the damaged region to be retried. That work is left for another time; hard IO errors during the removal process are expected to be exceptionally rare. 
Reviewed-by: Serapheim Dimitropoulos <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Reviewed-by: Tom Caputi <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Issue #6900 Closes #8161
author: Brian Behlendorf <[email protected]> 2018-12-04 09:37:37 -0800
committer: GitHub <[email protected]> 2018-12-04 09:37:37 -0800
commit: 7c9a42921e60dbad0e3003bd571591f073860233 (patch)
tree: 7dcdfdf535f286a9c3d4dc5f4996ed0e59c501c2 /module
parent: c40a1124e1d1010b665909ad31d2904630018f6f (diff)
1 files changed, 89 insertions, 11 deletions
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 49b9ed3a1..5952a5d8f 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -80,6 +80,8 @@
 typedef struct vdev_copy_arg {
 	metaslab_t	*vca_msp;
 	uint64_t	vca_outstanding_bytes;
+	uint64_t	vca_read_error_bytes;
+	uint64_t	vca_write_error_bytes;
 	kcondvar_t	vca_cv;
 	kmutex_t	vca_lock;
 } vdev_copy_arg_t;
@@ -100,6 +102,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
 /*
+ * Ignore hard IO errors during device removal.  When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled.  This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
  * Allow a remap segment to span free chunks of at most this size. The main
  * impact of a larger span is that we will read and write larger, more
  * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
@@ -126,6 +136,7 @@ int zfs_removal_suspend_progress = 0;
 #define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
 
 static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
@@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 
 	mutex_enter(&vca->vca_lock);
 	vca->vca_outstanding_bytes -= zio->io_size;
+
+	if (zio->io_error != 0)
+		vca->vca_write_error_bytes += zio->io_size;
+
 	cv_signal(&vca->vca_cv);
 	mutex_exit(&vca->vca_lock);
 }
@@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
+	vdev_copy_arg_t *vca = zio->io_private;
+
+	if (zio->io_error != 0) {
+		mutex_enter(&vca->vca_lock);
+		vca->vca_read_error_bytes += zio->io_size;
+		mutex_exit(&vca->vca_lock);
+	}
+
 	zio_nowait(zio_unique_parent(zio));
 }
 
@@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
 {
 	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
 
+	/*
+	 * If the destination child is unwritable then there is no point
+	 * in issuing the source reads which cannot be written.
+	 */
+	if (!vdev_writeable(dest_child_vd))
+		return;
+
 	mutex_enter(&vca->vca_lock);
 	vca->vca_outstanding_bytes += size;
 	mutex_exit(&vca->vca_lock);
 
 	abd_t *abd = abd_alloc_for_io(size, B_FALSE);
 
-	vdev_t *source_child_vd;
+	vdev_t *source_child_vd = NULL;
 	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
 		/*
 		 * Source and dest are both mirrors.  Copy from the same
 		 * child id as we are copying to (wrapping around if there
-		 * are more dest children than source children).
+		 * are more dest children than source children).  If the
+		 * preferred source child is unreadable select another.
 		 */
-		source_child_vd =
-		    source_vd->vdev_child[dest_id % source_vd->vdev_children];
+		for (int i = 0; i < source_vd->vdev_children; i++) {
+			source_child_vd = source_vd->vdev_child[
+			    (dest_id + i) % source_vd->vdev_children];
+			if (vdev_readable(source_child_vd))
+				break;
+		}
 	} else {
 		source_child_vd = source_vd;
 	}
 
+	/*
+	 * There should always be at least one readable source child or
+	 * the pool would be in a suspended state.  Somehow selecting an
+	 * unreadable child would result in IO errors, the removal process
+	 * being cancelled, and the pool reverting to its pre-removal state.
+	 */
+	ASSERT3P(source_child_vd, !=, NULL);
+
 	zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
 	    dest_child_vd, dest_offset, abd, size,
 	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
@@ -1361,6 +1404,8 @@ spa_vdev_remove_thread(void *arg)
 	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
 	vca.vca_outstanding_bytes = 0;
+	vca.vca_read_error_bytes = 0;
+	vca.vca_write_error_bytes = 0;
 
 	mutex_enter(&svr->svr_lock);
 
@@ -1490,6 +1535,14 @@ spa_vdev_remove_thread(void *arg)
 			dmu_tx_commit(tx);
 			mutex_enter(&svr->svr_lock);
 		}
+
+		mutex_enter(&vca.vca_lock);
+		if (zfs_removal_ignore_errors == 0 &&
+		    (vca.vca_read_error_bytes > 0 ||
+		    vca.vca_write_error_bytes > 0)) {
+			svr->svr_thread_exit = B_TRUE;
+		}
+		mutex_exit(&vca.vca_lock);
 	}
 
 	mutex_exit(&svr->svr_lock);
@@ -1511,6 +1564,21 @@ spa_vdev_remove_thread(void *arg)
 		svr->svr_thread = NULL;
 		cv_broadcast(&svr->svr_cv);
 		mutex_exit(&svr->svr_lock);
+
+		/*
+		 * During the removal process an unrecoverable read or write
+		 * error was encountered.  The removal process must be
+		 * cancelled or this damage may become permanent.
+		 */
+		if (zfs_removal_ignore_errors == 0 &&
+		    (vca.vca_read_error_bytes > 0 ||
+		    vca.vca_write_error_bytes > 0)) {
+			zfs_dbgmsg("canceling removal due to IO errors: "
+			    "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+			    vca.vca_read_error_bytes,
+			    vca.vca_write_error_bytes);
+			spa_vdev_remove_cancel_impl(spa);
+		}
 	} else {
 		ASSERT0(range_tree_space(svr->svr_allocd_segs));
 		vdev_remove_complete(spa);
@@ -1689,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 	    vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
 {
-	spa_vdev_remove_suspend(spa);
-
-	if (spa->spa_vdev_removal == NULL)
-		return (ENOTACTIVE);
-
 	uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
 	int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
@@ -1713,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
 	return (error);
 }
 
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+	spa_vdev_remove_suspend(spa);
+
+	if (spa->spa_vdev_removal == NULL)
+		return (ENOTACTIVE);
+
+	return (spa_vdev_remove_cancel_impl(spa));
+}
+
 /*
  * Called every sync pass of every txg if there's a svr.
  */
@@ -2162,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
 }
 
 #if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+	"Ignore hard IO errors when removing device");
+
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
 	"Largest contiguous segment to allocate when removing device");
author	Brian Behlendorf <[email protected]>	2018-12-04 09:37:37 -0800
committer	GitHub <[email protected]>	2018-12-04 09:37:37 -0800
commit	7c9a42921e60dbad0e3003bd571591f073860233 (patch)
tree	7dcdfdf535f286a9c3d4dc5f4996ed0e59c501c2 /module
parent	c40a1124e1d1010b665909ad31d2904630018f6f (diff)