-rw-r--r-- | cmd/zdb/zdb.c | 18
-rw-r--r-- | cmd/ztest/ztest.c | 58
-rw-r--r-- | include/sys/vdev_removal.h | 2
-rw-r--r-- | include/sys/zio.h | 2
-rw-r--r-- | lib/libzfs/libzfs_pool.c | 2
-rw-r--r-- | man/man5/zfs-module-parameters.5 | 17
-rw-r--r-- | module/zfs/dsl_scan.c | 15
-rw-r--r-- | module/zfs/metaslab.c | 2
-rw-r--r-- | module/zfs/spa.c | 7
-rw-r--r-- | module/zfs/spa_misc.c | 5
-rw-r--r-- | module/zfs/vdev.c | 26
-rw-r--r-- | module/zfs/vdev_indirect.c | 553
-rw-r--r-- | module/zfs/vdev_mirror.c | 27
-rw-r--r-- | module/zfs/vdev_removal.c | 362
-rw-r--r-- | module/zfs/zio.c | 50 |
15 files changed, 915 insertions, 231 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 02fb55944..852f85355 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3406,7 +3406,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { @@ -3422,13 +3422,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) svr->svr_allocd_segs, SM_ALLOC)); /* - * Clear everything past what has been synced, - * because we have not allocated mappings for it yet. + * Clear everything past what has been synced unless + * it's past the spacemap, because we have not allocated + * mappings for it yet. */ - range_tree_clear(svr->svr_allocd_segs, - vdev_indirect_mapping_max_offset(vim), - msp->ms_sm->sm_start + msp->ms_sm->sm_size - - vdev_indirect_mapping_max_offset(vim)); + uint64_t vim_max_offset = + vdev_indirect_mapping_max_offset(vim); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > vim_max_offset) + range_tree_clear(svr->svr_allocd_segs, + vim_max_offset, sm_end - vim_max_offset); } zcb->zcb_removing_size += diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0e3459d32..c81d446a5 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -445,6 +445,7 @@ static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; +static boolean_t ztest_device_removal_active = B_FALSE; /* * The ztest_name_lock protects the pool and dataset namespace used by @@ -3203,7 +3204,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * value. Don't bother trying to attach while we are in the middle * of removal. */ - if (spa->spa_vdev_removal != NULL) { + if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&ztest_vdev_lock); return; @@ -3375,16 +3376,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; + int error; mutex_enter(&ztest_vdev_lock); + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * Remove a random top-level vdev and wait for removal to finish. + */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); - (void) spa_vdev_remove(spa, guid, B_FALSE); + error = spa_vdev_remove(spa, guid, B_FALSE); + if (error == 0) { + ztest_device_removal_active = B_TRUE; + mutex_exit(&ztest_vdev_lock); + + while (spa->spa_vdev_removal != NULL) + txg_wait_synced(spa_get_dsl(spa), 0); + } else { + mutex_exit(&ztest_vdev_lock); + return; + } + /* + * The pool needs to be scrubbed after completing device removal. + * Failure to do so may result in checksum errors due to the + * strategy employed by ztest_fault_inject() when selecting which + * offset are redundant and can be damaged. + */ + error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + + mutex_enter(&ztest_vdev_lock); + ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } @@ -3524,7 +3558,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) * that the metaslab_class space increased (because it decreases * when the device removal completes). 
*/ - if (spa->spa_vdev_removal != NULL) { + if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, FTAG); mutex_exit(&ztest_vdev_lock); return; @@ -5520,6 +5554,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); + + /* + * Device removal is in progress, fault injection must be disabled + * until it completes and the pool is scrubbed. The fault injection + * strategy for damaging blocks does not take in to account evacuated + * blocks which may have already been damaged. + */ + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + goto out; + } + maxfaults = MAXFAULTS(zs); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; mirror_save = zs->zs_mirrors; @@ -5875,6 +5921,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; + /* + * Scrub in progress by device removal. + */ + if (ztest_device_removal_active) + return; + (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ (void) spa_scan(spa, POOL_SCAN_SCRUB); diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 5b1e3056b..80ee09456 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -30,7 +30,7 @@ extern "C" { #endif typedef struct spa_vdev_removal { - vdev_t *svr_vdev; + uint64_t svr_vdev_id; uint64_t svr_max_offset_to_sync[TXG_SIZE]; /* Thread performing a vdev removal. */ kthread_t *svr_thread; diff --git a/include/sys/zio.h b/include/sys/zio.h index a275b16de..dc6841d83 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -600,7 +600,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 99b4ea29a..2d94cd320 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2879,7 +2879,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or pool has removing/removed vdevs"), + "or device removal is in progress"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 818f26449..6e670facb 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1742,6 +1742,23 @@ Use \fB1\fR for yes and \fB0\fR for no (default). .sp .ne 2 .na +\fBzfs_reconstruct_indirect_segments_max\fR (int) +.ad +.RS 12n +When a split block which is part of a indirect vdev contains more than this +many segments, consider it too computationally expensive to check all possible +combinations. Instead, operate under the assumption that only a few segment +copies are damaged and the majority of segment copies are good, in which case +it is reasonable to randomly select sample combinations. This allows all the +segment copies to participate fairly in the reconstruction and prevents the +repeated use of one bad copy. +.sp +Default value: \fB10\fR. 
+.RE + +.sp +.ne 2 +.na \fBzfs_recover\fR (int) .ad .RS 12n diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 53953a6c5..b87b4d555 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2959,6 +2959,19 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, { vdev_t *vd; + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } + if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple @@ -2971,8 +2984,6 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } - vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 25090f089..52a60cc5e 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -3234,7 +3234,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, return; if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev == vd && + spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 08fc7bbda..9a5346b42 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -4860,8 +4860,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != - spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ @@ -4970,10 +4969,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 5c6e82567..54299a7e4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1708,9 +1708,12 @@ spa_update_dspace(spa_t *spa) * allocated twice (on the old device and the new * device). */ - vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f3e3f90fa..0dea0b4e4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -857,6 +857,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. 
+ */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 86a05daa8..3ccdfee3b 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -23,6 +23,7 @@ #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/metaslab.h> #include <sys/refcount.h> #include <sys/dmu.h> @@ -44,10 +45,11 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - reads and repair writes to this device use the callback to create - * a child io for each mapped segment. + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. * - * - frees and claims to this device use the callback to free or claim + * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak @@ -202,6 +204,95 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; int zfs_condense_indirect_commit_entry_delay_ms = 0; /* + * If a split block contains more than this many segments, consider it too + * computationally expensive to check all (2^num_segments) possible + * combinations. Instead, try at most 2^_segments_max randomly-selected + * combinations. + * + * This is reasonable if only a few segment copies are damaged and the + * majority of segment copies are good. It allows all segment copies to + * participate fairly in the reconstruction and prevents repeated use of + * one bad copy. + */ +int zfs_reconstruct_indirect_segments_max = 10; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. 
For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. + */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + int is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; + +/* * Mark the given offset and size as being obsolete in the given txg. */ void @@ -814,12 +905,6 @@ vdev_indirect_close(vdev_t *vd) } /* ARGSUSED */ -static void -vdev_indirect_io_done(zio_t *zio) -{ -} - -/* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) @@ -990,41 +1075,471 @@ vdev_indirect_child_io_done(zio_t *zio) abd_put(zio->io_abd); } +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ static void -vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, - abd_get_offset(zio->io_abd, split_offset), - size, zio->io_type, zio->io_priority, - 0, vdev_indirect_child_io_done, zio)); + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. 
+ */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. + */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) + */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { ASSERTV(spa_t *spa = zio->io_spa); + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT((zio->io_flags & - (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_io_start_cb, zio); + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. 
+ */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * Read one copy of each split segment, from the + * top-level vdev. Since we don't know the + * checksum of each split individually, the child + * zio can't ensure that we get the right data. + * E.g. if it's a mirror, it will just read from a + * random (healthy) leaf vdev. We have to verify + * the checksum in vdev_indirect_io_done(). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, + abd_get_offset(zio->io_abd, + is->is_split_offset), is->is_size, + zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + + } + } zio_execute(zio); } +/* + * Report a checksum error for a child. + */ +static void +vdev_indirect_checksum_error(zio_t *zio, + indirect_split_t *is, indirect_child_t *ic) +{ + vdev_t *vd = ic->ic_vdev; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zio_bad_cksum_t zbc = {{{ 0 }}}; + abd_t *bad_abd = ic->ic_data; + abd_t *good_abd = is->is_child[is->is_good_child].ic_data; + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); +} + +/* + * Issue repair i/os for any incorrect copies. We do this by comparing + * each split segment's correct data (is_good_child's ic_data) with each + * other copy of the data. If they differ, then we overwrite the bad data + * with the good copy. Note that we do this without regard for the DTL's, + * which simplifies this code and also issues the optimal number of writes + * (based on which copies actually read bad data, as opposed to which we + * think might be wrong). For the same reason, we always use + * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). + */ +static void +vdev_indirect_repair(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + enum zio_flag flags = ZIO_FLAG_IO_REPAIR; + + if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) + flags |= ZIO_FLAG_SELF_HEAL; + + if (!spa_writeable(zio->io_spa)) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *good_child = &is->is_child[is->is_good_child]; + + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic == good_child) + continue; + if (ic->ic_data == NULL) + continue; + if (abd_cmp(good_child->ic_data, ic->ic_data) == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, + good_child->ic_data, is->is_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL)); + + vdev_indirect_checksum_error(zio, is, ic); + } + } +} + +/* + * Report checksum errors on all children that we read from. 
+ */ +static void +vdev_indirect_all_checksum_errors(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic->ic_data == NULL) + continue; + + vdev_t *vd = ic->ic_vdev; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, + NULL, NULL, NULL); + } + } +} + +/* + * This function is called when we have read all copies of the data and need + * to try to find a combination of copies that gives us the right checksum. + * + * If we pointed to any mirror vdevs, this effectively does the job of the + * mirror. The mirror vdev code can't do its own job because we don't know + * the checksum of each split segment individually. We have to try every + * combination of copies of split segments, until we find one that checksums + * correctly. (Or until we have tried all combinations, or have tried + * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we + * set io_error to ECKSUM to propagate the error up to the user.) + * + * For example, if we have 3 segments in the split, + * and each points to a 2-way mirror, we will have the following pieces of + * data: + * + * | mirror child + * split | [0] [1] + * ======|===================== + * A | data_A_0 data_A_1 + * B | data_B_0 data_B_1 + * C | data_C_0 data_C_1 + * + * We will try the following (mirror children)^(number of splits) (2^3=8) + * combinations, which is similar to bitwise-little-endian counting in + * binary. In general each "digit" corresponds to a split segment, and the + * base of each digit is is_children, which can be different for each + * digit. + * + * "low bit" "high bit" + * v v + * data_A_0 data_B_0 data_C_0 + * data_A_1 data_B_0 data_C_0 + * data_A_0 data_B_1 data_C_0 + * data_A_1 data_B_1 data_C_0 + * data_A_0 data_B_0 data_C_1 + * data_A_1 data_B_0 data_C_1 + * data_A_0 data_B_1 data_C_1 + * data_A_1 data_B_1 data_C_1 + * + * Note that the split segments may be on the same or different top-level + * vdevs. In either case, we try lots of combinations (see + * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has + * small silent errors on all of its children, we can still reconstruct the + * correct data, as long as those errors are at sufficiently-separated + * offsets (specifically, separated by the largest block size - default of + * 128KB, but up to 16MB). + */ +static void +vdev_indirect_reconstruct_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + uint64_t attempts = 0; + uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; + int segments = 0; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) + segments++; + + for (;;) { + /* copy data from splits to main zio */ + int ret; + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + + /* + * If this child failed, its ic_data will be NULL. + * Skip this combination. 
+ */ + if (is->is_child[is->is_good_child].ic_data == NULL) { + ret = EIO; + goto next; + } + + abd_copy_off(zio->io_abd, + is->is_child[is->is_good_child].ic_data, + is->is_split_offset, 0, is->is_size); + } + + /* See if this checksum matches. */ + zio_bad_cksum_t zbc; + ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + /* Found a matching checksum. Issue repair i/os. */ + vdev_indirect_repair(zio); + zio_checksum_verified(zio); + return; + } + + /* + * Checksum failed; try a different combination of split + * children. + */ + boolean_t more; +next: + more = B_FALSE; + if (segments <= zfs_reconstruct_indirect_segments_max) { + /* + * There are relatively few segments, so + * deterministically check all combinations. We do + * this by by adding one to the first split's + * good_child. If it overflows, then "carry over" to + * the next split (like counting in base is_children, + * but each digit can have a different base). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child++; + if (is->is_good_child < is->is_children) { + more = B_TRUE; + break; + } + is->is_good_child = 0; + } + } else if (++attempts < attempts_max) { + /* + * There are too many combinations to try all of them + * in a reasonable amount of time, so try a fixed + * number of random combinations, after which we'll + * consider the block unrecoverable. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = + spa_get_random(is->is_children); + } + more = B_TRUE; + } + if (!more) { + /* All combinations failed. */ + zio->io_error = ret; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. 
+ */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, @@ -1061,4 +1576,8 @@ MODULE_PARM_DESC(zfs_condense_min_mapping_bytes, module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, "Delay while condensing vdev mapping"); + +module_param(zfs_reconstruct_indirect_segments_max, int, 0644); +MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max, + "Maximum number of split segments check all combinations"); #endif diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index b56f955eb..4b01f317b 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -486,12 +486,15 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -640,7 +643,21 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_error == 0) { if (mc->mc_tried) continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. the DTL indicates that this data is + * missing from this vdev + */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 6e81bf014..0fca8fb03 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -84,18 +84,12 @@ typedef struct vdev_copy_arg { kmutex_t vca_lock; } vdev_copy_arg_t; -typedef struct vdev_copy_seg_arg { - vdev_copy_arg_t *vcsa_copy_arg; - uint64_t vcsa_txg; - dva_t *vcsa_dest_dva; - blkptr_t *vcsa_dest_bp; -} vdev_copy_seg_arg_t; - /* - * The maximum amount of allowed data we're allowed to copy from a device - * at a time when removing it. + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. 
*/ -int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -165,7 +159,7 @@ spa_vdev_removal_create(vdev_t *vd) mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev = vd; + svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, NULL); @@ -207,9 +201,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr) static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { - vdev_t *vd = arg; + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; ASSERTV(uint64_t txg = dmu_tx_get_txg(tx)); @@ -331,7 +326,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* @@ -372,21 +367,24 @@ spa_remove_init(spa_t *spa) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - if (vd == NULL) + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); + } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT(svr->svr_vdev->vdev_removing); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } @@ -439,15 +437,8 @@ spa_restart_removal(spa_t *spa) if (!spa_writeable(spa)) return; - vdev_t *vd = svr->svr_vdev; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd, !=, NULL); - ASSERT(vd->vdev_removing); - - zfs_dbgmsg("restarting removal of %llu at count=%llu", - vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } @@ -468,7 +459,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); - ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); ASSERT3U(spa_syncing_txg(spa), ==, txg); mutex_enter(&svr->svr_lock); @@ -646,7 +637,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != UINT64_MAX) { @@ -690,7 +681,7 @@ 
vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config); uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -718,64 +709,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) spa_sync_removing_state(spa, tx); } +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_nullzio_done(zio_t *zio) +{ + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. + */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; - spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + vdev_copy_arg_t *vca = zio->io_private; + abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); - - ASSERT0(zio->io_error); - kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); - kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); } +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - dva_t *dest_dva = vcsa->vcsa_dest_dva; - uint64_t txg = vcsa->vcsa_txg; - spa_t *spa = zio->io_spa; - ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva))); - blkptr_t *bp = NULL; - dva_t *dva = NULL; - uint64_t size = zio->io_size; - - ASSERT3P(dest_vd, !=, NULL); - ASSERT0(zio->io_error); - - vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - bp = vcsa->vcsa_dest_bp; - dva = bp->blk_dva; - - BP_ZERO(bp); - - /* initialize with dest_dva */ - bcopy(dest_dva, dva, sizeof (dva_t)); - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, - txg, bp, zio->io_abd, size, - spa_vdev_copy_segment_write_done, vcsa, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + zio_nowait(zio_unique_parent(zio)); +} + +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. Since we aren't verifying checksums, this + * ensures that as long as there's a good copy of the data, we'll have a + * good copy after the removal, even if there's silent damage to one side + * of the mirror. If we're removing a mirror that has some silent damage, + * we'll have exactly the same damage in the new location (assuming that + * the new location is also a mirror). + * + * We accomplish this by creating a tree of zio_t's, with as many writes as + * there are "children" of the new vdev (a non-redundant vdev counts as one + * child, a 2-way mirror has 2 children, etc). Each write has an associated + * read from a child of the old vdev. Typically there will be the same + * number of children of the old and new vdevs. 
However, if there are more + * children of the new vdev, some child(ren) of the old vdev will be issued + * multiple reads. If there are more children of the old vdev, some copies + * will be dropped. + * + * For example, the tree of zio_t's for a 2-way mirror is: + * + * null + * / \ + * write(new vdev, child 0) write(new vdev, child 1) + * | | + * read(old vdev, child 0) read(old vdev, child 1) + * + * Child zio's complete before their parents complete. However, zio's + * created with zio_vdev_child_io() may be issued before their children + * complete. In this case we need to make sure that the children (reads) + * complete before the parents (writes) are *issued*. We do this by not + * calling zio_nowait() on each write until its corresponding read has + * completed. + * + * The spa_config_lock must be held while zio's created by + * zio_vdev_child_io() are in progress, to ensure that the vdev tree does + * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" + * zio is needed to release the spa_config_lock after all the reads and + * writes complete. (Note that we can't grab the config lock for each read, + * because it is not reentrant - we could deadlock with a thread waiting + * for a write lock.) + */ +static void +spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, + vdev_t *source_vd, uint64_t source_offset, + vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) +{ + ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + + vdev_t *source_child_vd; + if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { + /* + * Source and dest are both mirrors. Copy from the same + * child id as we are copying to (wrapping around if there + * are more dest children than source children). + */ + source_child_vd = + source_vd->vdev_child[dest_id % source_vd->vdev_children]; + } else { + source_child_vd = source_vd; + } + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); } +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. 
+ */ static int spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) @@ -784,10 +839,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; - vdev_copy_seg_arg_t *private; dva_t dst = {{ 0 }}; - blkptr_t blk, *bp = &blk; - dva_t *dva = bp->blk_dva; ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); @@ -804,51 +856,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, */ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; - private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); - private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - private->vcsa_txg = txg; - private->vcsa_copy_arg = vca; - /* - * This lock is eventually released by the donefunc for the - * zio_write_phys that finishes copying the data. + * See comment before spa_vdev_copy_one_child(). */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - /* - * Do logical I/O, letting the redundancy vdevs (like mirror) - * handle their own I/O instead of duplicating that code here. - */ - BP_ZERO(bp); - - DVA_SET_VDEV(&dva[0], vd->vdev_id); - DVA_SET_OFFSET(&dva[0], start); - DVA_SET_GANG(&dva[0], 0); - DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); - - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, - bp, abd_alloc_for_io(size, B_FALSE), size, - spa_vdev_copy_segment_read_done, private, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_nullzio_done, NULL, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); @@ -866,8 +895,8 @@ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; - vdev_t *vd = svr->svr_vdev; - spa_t *spa = vd->vdev_spa; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); @@ -896,37 +925,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) } static void -vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) -{ - ivd->vdev_indirect_config = vd->vdev_indirect_config; - - ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); - ASSERT(vd->vdev_indirect_mapping != NULL); - ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; - vd->vdev_indirect_mapping = NULL; - - 
ASSERT3P(ivd->vdev_indirect_births, ==, NULL); - ASSERT(vd->vdev_indirect_births != NULL); - ivd->vdev_indirect_births = vd->vdev_indirect_births; - vd->vdev_indirect_births = NULL; - - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); - ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); - - /* - * We cannot use space_map_{open,close} because we hold all - * the config locks as writer. - */ - ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); - ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; - vd->vdev_obsolete_sm = NULL; - } -} - -static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); @@ -961,17 +959,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); - vdev_indirect_state_transfer(ivd, vd); - - svr->svr_vdev = ivd; - - ASSERT(!ivd->vdev_removing); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -994,9 +988,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) * context by the removal thread after we have copied all vdev's data. */ static void -vdev_remove_complete(vdev_t *vd) +vdev_remove_complete(spa_t *spa) { - spa_t *spa = vd->vdev_spa; uint64_t txg; /* @@ -1004,8 +997,12 @@ vdev_remove_complete(vdev_t *vd) * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); @@ -1025,6 +1022,10 @@ vdev_remove_complete(vdev_t *vd) /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -1046,6 +1047,9 @@ vdev_remove_complete(vdev_t *vd) */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); + + if (ev != NULL) + spa_event_post(ev); } /* @@ -1056,7 +1060,7 @@ vdev_remove_complete(vdev_t *vd) * this size again this txg. 
*/ static void -spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); @@ -1095,7 +1099,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, while (length > 0) { uint64_t mylen = MIN(length, thismax); - int error = spa_vdev_copy_segment(svr->svr_vdev, + int error = spa_vdev_copy_segment(vd, offset, mylen, txg, vca, &zal); if (error == ENOSPC) { @@ -1153,12 +1157,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, static void spa_vdev_remove_thread(void *arg) { - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; + spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = zfs_remove_max_segment; uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); @@ -1166,7 +1172,6 @@ spa_vdev_remove_thread(void *arg) ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3P(svr->svr_vdev, ==, vd); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1247,6 +1252,17 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { @@ -1260,11 +1276,19 @@ spa_vdev_remove_thread(void *arg) VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + if (txg != last_txg) max_alloc = zfs_remove_max_segment; last_txg = txg; - spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); @@ -1272,6 +1296,9 @@ spa_vdev_remove_thread(void *arg) } mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + /* * Wait for all copies to finish before cleaning up the vca. */ @@ -1289,7 +1316,7 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(vd); + vdev_remove_complete(spa); } } @@ -1330,7 +1357,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; @@ -1403,8 +1430,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * because we have not allocated mappings for it yet. 
*/ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - range_tree_clear(svr->svr_allocd_segs, syncd, - msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } @@ -1465,7 +1495,7 @@ spa_vdev_remove_cancel(spa_t *spa) if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); - uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); @@ -1774,7 +1804,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, - vd, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return (0); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5259a0c6f..9bb2459bb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1212,17 +1212,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); - /* - * In the common case, where the parent zio was to a normal vdev, - * the child zio must be to a child vdev of that vdev. Otherwise, - * the child zio must be to a top-level vdev. - */ - if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { - ASSERT3P(vd->vdev_parent, ==, pio->io_vd); - } else { - ASSERT3P(vd, ==, vd->vdev_top); - } - if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the @@ -1283,7 +1272,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -3481,7 +3470,7 @@ zio_vdev_io_start(zio_t *zio) */ ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_INDUCE_DAMAGE)); + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } align = 1ULL << vd->vdev_top->vdev_ashift; @@ -3521,18 +3510,37 @@ zio_vdev_io_start(zio_t *zio) * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering with nested replication. - * For example, given a mirror of mirrors, (A+B)+(C+D), if only - * A is out of date, we'll read from C+D, then use the data to - * resilver A+B -- but we don't actually want to resilver B, just A. - * The top-level mirror has no way to know this, so instead we just - * discard unnecessary repairs as we work our way down the vdev tree. - * The same logic applies to any form of nested replication: - * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. 
With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); |
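
Note on the reconstruction logic added in module/zfs/vdev_indirect.c above: vdev_indirect_reconstruct_io_done() walks combinations of segment copies the way a mixed-radix counter works, where each split segment is one "digit" whose base is its is_children, and it falls back to random sampling once there are more than zfs_reconstruct_indirect_segments_max segments. The user-space sketch below models only that counting/sampling step; split_t, try_combination() and the rand()-based sampling are illustrative stand-ins, not the ZFS types or spa_get_random().

/*
 * Illustrative user-space model of the combination walk performed by
 * vdev_indirect_reconstruct_io_done() in the hunk above.  split_t and
 * try_combination() are stand-ins, not ZFS types; only the counting
 * logic mirrors the patch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct split {
	int children;		/* copies of this segment (is_children) */
	int good;		/* copy currently selected (is_good_child) */
} split_t;

static int segments_max = 10;	/* zfs_reconstruct_indirect_segments_max */

/* Stand-in for "assemble the selected copies and verify the checksum". */
static int
try_combination(const split_t *s, int nsplits)
{
	return (-1);		/* pretend every combination fails */
}

static int
reconstruct(split_t *splits, int nsplits)
{
	uint64_t attempts = 0;
	uint64_t attempts_max = 1ULL << segments_max;

	for (;;) {
		if (try_combination(splits, nsplits) == 0)
			return (0);	/* checksum matched */

		int more = 0;
		if (nsplits <= segments_max) {
			/*
			 * Few segments: enumerate every combination by
			 * mixed-radix counting -- add one to the lowest
			 * "digit" and carry over into the next split.
			 */
			for (int i = 0; i < nsplits; i++) {
				if (++splits[i].good < splits[i].children) {
					more = 1;
					break;
				}
				splits[i].good = 0;	/* carry over */
			}
		} else if (++attempts < attempts_max) {
			/* Too many combinations: sample them at random. */
			for (int i = 0; i < nsplits; i++)
				splits[i].good = rand() % splits[i].children;
			more = 1;
		}
		if (!more)
			return (-1);	/* exhausted; block unrecoverable */
	}
}

int
main(void)
{
	/* Three 2-way-mirrored segments => at most 2^3 = 8 combinations. */
	split_t splits[3] = { { 2, 0 }, { 2, 0 }, { 2, 0 } };
	printf("result = %d\n", reconstruct(splits, 3));
	return (0);
}

With the three 2-way-mirrored segments in main(), at most 2^3 = 8 combinations are tried; above the segment limit, at most 2^10 = 1024 randomly chosen combinations are attempted before the block is declared unrecoverable.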
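
For the mirror-to-mirror copy described before spa_vdev_copy_one_child() in module/zfs/vdev_removal.c, each destination child reads from the source child with the same index, wrapping around when the destination is wider than the source. A trivial model of that pairing (the child counts below are hypothetical):

/* Model of the child pairing used by spa_vdev_copy_one_child(). */
#include <stdio.h>

int
main(void)
{
	int source_children = 2;	/* e.g. removing a 2-way mirror */
	int dest_children = 3;		/* e.g. copying onto a 3-way mirror */

	for (int dest_id = 0; dest_id < dest_children; dest_id++)
		printf("dest child %d <- source child %d\n",
		    dest_id, dest_id % source_children);
	return (0);
}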
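
The switch from the cached svr_vdev pointer to svr_vdev_id means every consumer now resolves the top-level vdev under the config lock, since the vdev_t can be replaced while a removal is pending (the removal thread's comment notes a concurrent vdev_attach or vdev_detach). A sketch of that pattern, modeled on the spa_update_dspace() hunk above; removing_vdev_space() is a hypothetical helper, and the fragment assumes the usual ZFS kernel headers rather than being a standalone program:

/* Hypothetical helper showing the lookup-by-id pattern from this patch. */
static uint64_t
removing_vdev_space(spa_t *spa)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	uint64_t space;

	/* Hold SCL_VDEV so the vdev tree cannot change underneath us. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	space = spa_deflate(spa) ?
	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (space);
}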
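
Both zdb_claim_removing() (cmd/zdb/zdb.c) and spa_vdev_remove_cancel_sync() (module/zfs/vdev_removal.c) gain an explicit check that the space map end lies beyond the synced mapping offset before calling range_tree_clear(). The reason is plain unsigned arithmetic: if the synced mapping extends past the metaslab's space map, the old unconditional subtraction wraps around and requests an enormous clear. A small standalone demonstration with hypothetical offsets:

/* Why the new "if (sm_end > syncd)" guard matters with uint64_t math. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t sm_end = 1ULL << 30;	/* hypothetical metaslab sm end */
	uint64_t syncd  = 3ULL << 30;	/* mapping already synced past it */

	/* The unguarded subtraction wraps to a huge "size". */
	printf("naive size = %llu\n",
	    (unsigned long long)(sm_end - syncd));

	if (sm_end > syncd)
		printf("clear %llu bytes\n",
		    (unsigned long long)(sm_end - syncd));
	else
		printf("nothing to clear\n");	/* guarded path */
	return (0);
}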