Diffstat (limited to 'module')
-rw-r--r--  module/zfs/vdev_removal.c  100
1 file changed, 89 insertions(+), 11 deletions(-)
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 49b9ed3a1..5952a5d8f 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -80,6 +80,8 @@
typedef struct vdev_copy_arg {
metaslab_t *vca_msp;
uint64_t vca_outstanding_bytes;
+ uint64_t vca_read_error_bytes;
+ uint64_t vca_write_error_bytes;
kcondvar_t vca_cv;
kmutex_t vca_lock;
} vdev_copy_arg_t;
@@ -100,6 +102,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
/*
+ * Ignore hard IO errors during device removal.  When set, if a device
+ * encounters a hard IO error during the removal process the removal
+ * will not be cancelled.  This can result in a normally recoverable
+ * block becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
* Allow a remap segment to span free chunks of at most this size. The main
* impact of a larger span is that we will read and write larger, more
* contiguous chunks, with more "unnecessary" data -- trading off bandwidth
@@ -126,6 +136,7 @@ int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
@@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
+
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
}
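
For illustration, the accounting done in the two completion callbacks
reduces to the following user-space model (a sketch only: pthreads stand
in for the kernel's kmutex_t/kcondvar_t, and copy_progress_t and
copy_write_done are hypothetical names, not part of the patch):

	#include <pthread.h>
	#include <stdint.h>

	typedef struct copy_progress {
		uint64_t cp_outstanding_bytes;
		uint64_t cp_read_error_bytes;
		uint64_t cp_write_error_bytes;
		pthread_mutex_t cp_lock;
		pthread_cond_t cp_cv;
	} copy_progress_t;

	/*
	 * Model of spa_vdev_copy_segment_write_done(): retire the
	 * in-flight bytes, tally any failed bytes under the same lock,
	 * then wake the removal thread waiting on the condvar.
	 */
	static void
	copy_write_done(copy_progress_t *cp, uint64_t size, int error)
	{
		pthread_mutex_lock(&cp->cp_lock);
		cp->cp_outstanding_bytes -= size;
		if (error != 0)
			cp->cp_write_error_bytes += size;
		pthread_cond_signal(&cp->cp_cv);
		pthread_mutex_unlock(&cp->cp_lock);
	}

Because the error bytes are added under the same lock that protects
vca_outstanding_bytes, the removal thread can sample both counters
consistently in a single critical section.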
@@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
zio_nowait(zio_unique_parent(zio));
}
@@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
{
ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+ /*
+ * If the destination child is unwritable then there is no point
+ * in issuing the source reads which cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes += size;
mutex_exit(&vca->vca_lock);
abd_t *abd = abd_alloc_for_io(size, B_FALSE);
- vdev_t *source_child_vd;
+ vdev_t *source_child_vd = NULL;
if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
/*
* Source and dest are both mirrors. Copy from the same
* child id as we are copying to (wrapping around if there
- * are more dest children than source children).
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
*/
- source_child_vd =
- source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
} else {
source_child_vd = source_vd;
}
+ /*
+ * There should always be at least one readable source child or
+ * the pool would be in a suspended state.  If an unreadable child
+ * were somehow selected, the resulting IO errors would cancel the
+ * removal process and revert the pool to its pre-removal state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
dest_child_vd, dest_offset, abd, size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
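
The wraparound selection above can be modeled in isolation.  A minimal
sketch, where pick_source_child() and the readable() predicate are
hypothetical stand-ins for the vdev child walk and vdev_readable():

	/*
	 * Prefer the source child at dest_id modulo the source child
	 * count, probing forward past unreadable children.  Returns the
	 * chosen index, or the last index probed if none is readable
	 * (matching the loop above, which never expects that case).
	 */
	static int
	pick_source_child(int dest_id, int src_children,
	    int (*readable)(int))
	{
		int c = -1;
		for (int i = 0; i < src_children; i++) {
			c = (dest_id + i) % src_children;
			if (readable(c))
				break;
		}
		return (c);
	}

For example, with three destination children and two source children,
dest_id 2 prefers source child 2 % 2 == 0; if child 0 is unreadable the
loop falls back to child 1.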
@@ -1361,6 +1404,8 @@ spa_vdev_remove_thread(void *arg)
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
mutex_enter(&svr->svr_lock);
@@ -1490,6 +1535,14 @@ spa_vdev_remove_thread(void *arg)
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
}
+
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
}
mutex_exit(&svr->svr_lock);
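
Note that this check sits inside the removal thread's copy loop, after
each transaction commits, so an error observed by either completion
callback stops the thread from issuing further copies on the next
iteration rather than only after the whole vdev has been scanned.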
@@ -1511,6 +1564,21 @@ spa_vdev_remove_thread(void *arg)
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /*
+ * During the removal process an unrecoverable read or write
+ * error was encountered. The removal process must be
+ * cancelled or this damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ vca.vca_read_error_bytes,
+ vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
vdev_remove_complete(spa);
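
Calling spa_vdev_remove_cancel_impl() here, rather than the public
spa_vdev_remove_cancel(), is what motivates the refactoring below: at
this point the removal thread has already cleared svr_thread and
broadcast svr_cv, so the suspend step performed by the public entry
point is unnecessary when cancelling from within the thread itself.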
@@ -1689,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
{
- spa_vdev_remove_suspend(spa);
-
- if (spa->spa_vdev_removal == NULL)
- return (ENOTACTIVE);
-
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
@@ -1713,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+
+ return (spa_vdev_remove_cancel_impl(spa));
+}
+
/*
* Called every sync pass of every txg if there's a svr.
*/
@@ -2162,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
}
#if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+ "Ignore hard IO errors when removing device");
+
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");