-rw-r--r--   cmd/zdb/zdb.c                        18
-rw-r--r--   cmd/ztest/ztest.c                    58
-rw-r--r--   include/sys/vdev_removal.h            2
-rw-r--r--   include/sys/zio.h                     2
-rw-r--r--   lib/libzfs/libzfs_pool.c              2
-rw-r--r--   man/man5/zfs-module-parameters.5     17
-rw-r--r--   module/zfs/dsl_scan.c                15
-rw-r--r--   module/zfs/metaslab.c                 2
-rw-r--r--   module/zfs/spa.c                      7
-rw-r--r--   module/zfs/spa_misc.c                 5
-rw-r--r--   module/zfs/vdev.c                    26
-rw-r--r--   module/zfs/vdev_indirect.c          553
-rw-r--r--   module/zfs/vdev_mirror.c             27
-rw-r--r--   module/zfs/vdev_removal.c           362
-rw-r--r--   module/zfs/zio.c                     50
15 files changed, 915 insertions, 231 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 02fb55944..852f85355 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -3406,7 +3406,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
@@ -3422,13 +3422,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
svr->svr_allocd_segs, SM_ALLOC));
/*
- * Clear everything past what has been synced,
- * because we have not allocated mappings for it yet.
+ * Clear everything past what has been synced unless
+ * it's past the spacemap, because we have not allocated
+ * mappings for it yet.
*/
- range_tree_clear(svr->svr_allocd_segs,
- vdev_indirect_mapping_max_offset(vim),
- msp->ms_sm->sm_start + msp->ms_sm->sm_size -
- vdev_indirect_mapping_max_offset(vim));
+ uint64_t vim_max_offset =
+ vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > vim_max_offset)
+ range_tree_clear(svr->svr_allocd_segs,
+ vim_max_offset, sm_end - vim_max_offset);
}
zcb->zcb_removing_size +=
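
For context on the guard above (a standalone sketch, not part of this patch): with uint64_t operands, computing the clear length as sm_end - vim_max_offset wraps around to a huge value whenever the synced mapping already extends past the end of the metaslab's space map, so the new comparison skips the range_tree_clear() call entirely in that case. The offsets below are made up for illustration.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical values: the synced mapping ends past the space map. */
	uint64_t vim_max_offset = 1000;
	uint64_t sm_end = 900;

	/* Old form: unsigned subtraction wraps to an enormous length. */
	uint64_t unguarded_len = sm_end - vim_max_offset;
	printf("unguarded length: %llu\n", (unsigned long long)unguarded_len);

	/* Guarded form, mirroring the change above: do nothing instead. */
	if (sm_end > vim_max_offset) {
		printf("clear %llu bytes\n",
		    (unsigned long long)(sm_end - vim_max_offset));
	} else {
		printf("nothing past the mapping to clear\n");
	}
	return (0);
}
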
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 0e3459d32..c81d446a5 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -445,6 +445,7 @@ static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;
static kmutex_t ztest_vdev_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
@@ -3203,7 +3204,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* value. Don't bother trying to attach while we are in the middle
* of removal.
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
@@ -3375,16 +3376,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
vdev_t *vd;
uint64_t guid;
+ int error;
mutex_enter(&ztest_vdev_lock);
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Remove a random top-level vdev and wait for removal to finish.
+ */
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
guid = vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
- (void) spa_vdev_remove(spa, guid, B_FALSE);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ if (error == 0) {
+ ztest_device_removal_active = B_TRUE;
+ mutex_exit(&ztest_vdev_lock);
+
+ while (spa->spa_vdev_removal != NULL)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ } else {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+ /*
+ * The pool needs to be scrubbed after completing device removal.
+ * Failure to do so may result in checksum errors due to the
+ * strategy employed by ztest_fault_inject() when selecting which
+ * offsets are redundant and can be damaged.
+ */
+ error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+ ztest_device_removal_active = B_FALSE;
mutex_exit(&ztest_vdev_lock);
}
@@ -3524,7 +3558,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* that the metaslab_class space increased (because it decreases
* when the device removal completes).
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_STATE, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
@@ -5520,6 +5554,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * While device removal is in progress, fault injection must be disabled
+ * until it completes and the pool is scrubbed. The fault injection
+ * strategy for damaging blocks does not take into account evacuated
+ * blocks which may have already been damaged.
+ */
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ goto out;
+ }
+
maxfaults = MAXFAULTS(zs);
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
@@ -5875,6 +5921,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
+ /*
+ * A scrub is already in progress as part of device removal.
+ */
+ if (ztest_device_removal_active)
+ return;
+
(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index 5b1e3056b..80ee09456 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -30,7 +30,7 @@ extern "C" {
#endif
typedef struct spa_vdev_removal {
- vdev_t *svr_vdev;
+ uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
diff --git a/include/sys/zio.h b/include/sys/zio.h
index a275b16de..dc6841d83 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -600,7 +600,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- struct abd *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 99b4ea29a..2d94cd320 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2879,7 +2879,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
- "or pool has removing/removed vdevs"),
+ "or device removal is in progress"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 818f26449..6e670facb 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1742,6 +1742,23 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
+\fBzfs_reconstruct_indirect_segments_max\fR (int)
+.ad
+.RS 12n
+When a split block which is part of an indirect vdev contains more than this
+many segments, consider it too computationally expensive to check all possible
+combinations. Instead, operate under the assumption that only a few segment
+copies are damaged and the majority of segment copies are good, in which case
+it is reasonable to randomly select sample combinations. This allows all the
+segment copies to participate fairly in the reconstruction and prevents the
+repeated use of one bad copy.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_recover\fR (int)
.ad
.RS 12n
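
To make the scaling behind this tunable concrete, here is a small standalone program (illustrative only; it assumes every split segment sits on a 2-way mirror, so each segment has two copies) showing how the default cap of 10 bounds the number of combinations attempted:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int zfs_reconstruct_indirect_segments_max = 10;	/* default */

	for (int segments = 8; segments <= 12; segments++) {
		/* 2 copies per segment => 2^segments total combinations. */
		uint64_t all = 1ULL << segments;
		uint64_t tried;

		if (segments <= zfs_reconstruct_indirect_segments_max)
			tried = all;	/* exhaustive check */
		else
			tried = 1ULL <<	/* random sampling cap */
			    zfs_reconstruct_indirect_segments_max;

		printf("%2d segments: try %llu of %llu combinations\n",
		    segments, (unsigned long long)tried,
		    (unsigned long long)all);
	}
	return (0);
}
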
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 53953a6c5..b87b4d555 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2959,6 +2959,19 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
{
vdev_t *vd;
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ return (B_TRUE);
+ }
+
if (DVA_GET_GANG(dva)) {
/*
* Gang members may be spread across multiple
@@ -2971,8 +2984,6 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
return (B_TRUE);
}
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
-
/*
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 25090f089..52a60cc5e 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -3234,7 +3234,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;
if (spa->spa_vdev_removal != NULL &&
- spa->spa_vdev_removal->svr_vdev == vd &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 08fc7bbda..9a5346b42 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -4860,8 +4860,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
- tvd->vdev_ashift !=
- spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+ tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
@@ -4970,10 +4969,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
- if (spa->spa_vdev_removal != NULL ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5c6e82567..54299a7e4 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1708,9 +1708,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
- vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
}
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f3e3f90fa..0dea0b4e4 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -857,6 +857,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 86a05daa8..3ccdfee3b 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -23,6 +23,7 @@
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
@@ -44,10 +45,11 @@
* "vdev_remap" operation that executes a callback on each contiguous
* segment of the new location. This function is used in multiple ways:
*
- * - reads and repair writes to this device use the callback to create
- * a child io for each mapped segment.
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
*
- * - frees and claims to this device use the callback to free or claim
+ * - frees and claims to this vdev use the callback to free or claim
* each mapped segment. (Note that we don't actually need to claim
* log blocks on indirect vdevs, because we don't allocate to
* removing vdevs. However, zdb uses zio_claim() for its leak
@@ -202,6 +204,95 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ms = 0;
/*
+ * If a split block contains more than this many segments, consider it too
+ * computationally expensive to check all (2^num_segments) possible
+ * combinations. Instead, try at most
+ * 2^zfs_reconstruct_indirect_segments_max randomly-selected combinations.
+ *
+ * This is reasonable if only a few segment copies are damaged and the
+ * majority of segment copies are good. It allows all segment copies to
+ * participate fairly in the reconstruction and prevents repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_segments_max = 10;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ int is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ vdev_indirect_map_free,
+ zio_vsd_default_cksum_report
+};
+
+/*
* Mark the given offset and size as being obsolete in the given txg.
*/
void
@@ -814,12 +905,6 @@ vdev_indirect_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
-}
-
-/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
@@ -990,41 +1075,471 @@ vdev_indirect_child_io_done(zio_t *zio)
abd_put(zio->io_abd);
}
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
static void
-vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
ASSERT3P(vd, !=, NULL);
if (vd->vdev_ops == &vdev_indirect_ops)
return;
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
- abd_get_offset(zio->io_abd, split_offset),
- size, zio->io_type, zio->io_priority,
- 0, vdev_indirect_child_io_done, zio));
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+ * Note, we may read from a child whose DTL
+ * indicates that the data may not be present here.
+ * While this might result in a few i/os that will
+ * likely return incorrect data, it simplifies the
+ * code since we can treat scrub and resilver
+ * identically. (The incorrect data will be
+ * detected and ignored when we verify the
+ * checksum.)
+ */
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
}
static void
vdev_indirect_io_start(zio_t *zio)
{
ASSERTV(spa_t *spa = zio->io_spa);
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (zio->io_type != ZIO_TYPE_READ) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- ASSERT((zio->io_flags &
- (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
}
vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
- vdev_indirect_io_start_cb, zio);
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * Read one copy of each split segment, from the
+ * top-level vdev. Since we don't know the
+ * checksum of each split individually, the child
+ * zio can't ensure that we get the right data.
+ * E.g. if it's a mirror, it will just read from a
+ * random (healthy) leaf vdev. We have to verify
+ * the checksum in vdev_indirect_io_done().
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset), is->is_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+
+ }
+ }
zio_execute(zio);
}
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = {{{ 0 }}};
+ abd_t *bad_abd = ic->ic_data;
+ abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+ zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+ is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
+
+ if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
+ flags |= ZIO_FLAG_SELF_HEAL;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (abd_cmp(good_child->ic_data, ic->ic_data) == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+ is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ }
+ }
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually. We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly. (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ uint64_t attempts = 0;
+ uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+ int segments = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ segments++;
+
+ for (;;) {
+ /* copy data from splits to main zio */
+ int ret;
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ /*
+ * If this child failed, its ic_data will be NULL.
+ * Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_data == NULL) {
+ ret = EIO;
+ goto next;
+ }
+
+ abd_copy_off(zio->io_abd,
+ is->is_child[is->is_good_child].ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ /* See if this checksum matches. */
+ zio_bad_cksum_t zbc;
+ ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ /* Found a matching checksum. Issue repair i/os. */
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * Checksum failed; try a different combination of split
+ * children.
+ */
+ boolean_t more;
+next:
+ more = B_FALSE;
+ if (segments <= zfs_reconstruct_indirect_segments_max) {
+ /*
+ * There are relatively few segments, so
+ * deterministically check all combinations. We do
+ * this by adding one to the first split's
+ * good_child. If it overflows, then "carry over" to
+ * the next split (like counting in base is_children,
+ * but each digit can have a different base).
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child++;
+ if (is->is_good_child < is->is_children) {
+ more = B_TRUE;
+ break;
+ }
+ is->is_good_child = 0;
+ }
+ } else if (++attempts < attempts_max) {
+ /*
+ * There are too many combinations to try all of them
+ * in a reasonable amount of time, so try a fixed
+ * number of random combinations, after which we'll
+ * consider the block unrecoverable.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child =
+ spa_get_random(is->is_children);
+ }
+ more = B_TRUE;
+ }
+ if (!more) {
+ /* All combinations failed. */
+ zio->io_error = ret;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+ }
+}
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
vdev_ops_t vdev_indirect_ops = {
vdev_indirect_open,
vdev_indirect_close,
@@ -1061,4 +1576,8 @@ MODULE_PARM_DESC(zfs_condense_min_mapping_bytes,
module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
"Delay while condensing vdev mapping");
+
+module_param(zfs_reconstruct_indirect_segments_max, int, 0644);
+MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max,
+ "Maximum number of split segments check all combinations");
#endif
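
The combination walk described in the vdev_indirect_reconstruct_io_done() comment above is ordinary mixed-radix counting: one digit per split segment, with that segment's child count as the digit's base. A standalone sketch (not ZFS code; the names are invented for illustration) that reproduces the eight-row ordering from the 2-way-mirror example:

#include <stdio.h>

/*
 * Advance choice[] to the next combination, treating choice[i] as a digit
 * in base bases[i] (least-significant digit first). Returns 0 once every
 * combination has been visited.
 */
static int
next_combination(int *choice, const int *bases, int nsegs)
{
	for (int i = 0; i < nsegs; i++) {
		choice[i]++;
		if (choice[i] < bases[i])
			return (1);	/* no carry: a new combination */
		choice[i] = 0;		/* overflow: carry to next digit */
	}
	return (0);			/* wrapped: all combinations tried */
}

int
main(void)
{
	int bases[3] = { 2, 2, 2 };	/* three splits, each a 2-way mirror */
	int choice[3] = { 0, 0, 0 };

	do {
		printf("data_A_%d data_B_%d data_C_%d\n",
		    choice[0], choice[1], choice[2]);
	} while (next_combination(choice, bases, 3));
	return (0);
}
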
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index b56f955eb..4b01f317b 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -486,12 +486,15 @@ vdev_mirror_io_start(zio_t *zio)
mm = vdev_mirror_map_init(zio);
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
/*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -640,7 +643,21 @@ vdev_mirror_io_done(zio_t *zio)
if (mc->mc_error == 0) {
if (mc->mc_tried)
continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect vdev (in which case
+ * it could point to any other vdev, which
+ * might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 6e81bf014..0fca8fb03 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -84,18 +84,12 @@ typedef struct vdev_copy_arg {
kmutex_t vca_lock;
} vdev_copy_arg_t;
-typedef struct vdev_copy_seg_arg {
- vdev_copy_arg_t *vcsa_copy_arg;
- uint64_t vcsa_txg;
- dva_t *vcsa_dest_dva;
- blkptr_t *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
/*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
@@ -165,7 +159,7 @@ spa_vdev_removal_create(vdev_t *vd)
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev = vd;
+ svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -207,9 +201,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
- vdev_t *vd = arg;
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
spa_vdev_removal_t *svr = NULL;
ASSERTV(uint64_t txg = dmu_tx_get_txg(tx));
@@ -331,7 +326,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(spa->spa_vdev_removal, ==, NULL);
spa->spa_vdev_removal = svr;
svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
/*
@@ -372,21 +367,24 @@ spa_remove_init(spa_t *spa)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_t *vd = vdev_lookup_top(spa,
spa->spa_removing_phys.sr_removing_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- if (vd == NULL)
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
return (EINVAL);
+ }
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
ASSERT(vdev_is_concrete(vd));
spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT(svr->svr_vdev->vdev_removing);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
spa->spa_meta_objset, vic->vic_mapping_object);
vd->vdev_indirect_births = vdev_indirect_births_open(
spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
spa->spa_vdev_removal = svr;
}
@@ -439,15 +437,8 @@ spa_restart_removal(spa_t *spa)
if (!spa_writeable(spa))
return;
- vdev_t *vd = svr->svr_vdev;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_removing);
-
- zfs_dbgmsg("restarting removal of %llu at count=%llu",
- vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
0, &p0, TS_RUN, minclsyspri);
}
@@ -468,7 +459,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
- ASSERT3P(vd, ==, svr->svr_vdev);
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
ASSERT3U(spa_syncing_txg(spa), ==, txg);
mutex_enter(&svr->svr_lock);
@@ -646,7 +637,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
if (state == DSS_FINISHED) {
spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
@@ -690,7 +681,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);
uint64_t txg = dmu_tx_get_txg(tx);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -718,64 +709,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
spa_sync_removing_state(spa, tx);
}
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_nullzio_done(zio_t *zio)
+{
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
- spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ vdev_copy_arg_t *vca = zio->io_private;
+
abd_free(zio->io_abd);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
-
- ASSERT0(zio->io_error);
- kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
- kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- dva_t *dest_dva = vcsa->vcsa_dest_dva;
- uint64_t txg = vcsa->vcsa_txg;
- spa_t *spa = zio->io_spa;
- ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)));
- blkptr_t *bp = NULL;
- dva_t *dva = NULL;
- uint64_t size = zio->io_size;
-
- ASSERT3P(dest_vd, !=, NULL);
- ASSERT0(zio->io_error);
-
- vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- bp = vcsa->vcsa_dest_bp;
- dva = bp->blk_dva;
-
- BP_ZERO(bp);
-
- /* initialize with dest_dva */
- bcopy(dest_dva, dva, sizeof (dva_t));
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
- txg, bp, zio->io_abd, size,
- spa_vdev_copy_segment_write_done, vcsa,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children).
+ */
+ source_child_vd =
+ source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
}
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
static int
spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
@@ -784,10 +839,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_entry_t *entry;
- vdev_copy_seg_arg_t *private;
dva_t dst = {{ 0 }};
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
@@ -804,51 +856,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
*/
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
entry->vime_mapping.vimep_dst = dst;
- private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
- private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- private->vcsa_txg = txg;
- private->vcsa_copy_arg = vca;
-
/*
- * This lock is eventually released by the donefunc for the
- * zio_write_phys that finishes copying the data.
+ * See comment before spa_vdev_copy_one_child().
*/
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- /*
- * Do logical I/O, letting the redundancy vdevs (like mirror)
- * handle their own I/O instead of duplicating that code here.
- */
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], start);
- DVA_SET_GANG(&dva[0], 0);
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
- bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_vdev_copy_segment_read_done, private,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_nullzio_done, NULL, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
@@ -866,8 +895,8 @@ static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
- vdev_t *vd = svr->svr_vdev;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
@@ -896,37 +925,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
}
static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
- ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
- ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
- ASSERT(vd->vdev_indirect_mapping != NULL);
- ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
- vd->vdev_indirect_mapping = NULL;
-
- ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
- ASSERT(vd->vdev_indirect_births != NULL);
- ivd->vdev_indirect_births = vd->vdev_indirect_births;
- vd->vdev_indirect_births = NULL;
-
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
- ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
- /*
- * We cannot use space_map_{open,close} because we hold all
- * the config locks as writer.
- */
- ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
- ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
- vd->vdev_obsolete_sm = NULL;
- }
-}
-
-static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
@@ -961,17 +959,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
vd->vdev_leaf_zap = 0;
vdev_remove_child(ivd, vd);
vdev_compact_children(ivd);
- vdev_indirect_state_transfer(ivd, vd);
-
- svr->svr_vdev = ivd;
-
- ASSERT(!ivd->vdev_removing);
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -994,9 +988,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
* context by the removal thread after we have copied all vdev's data.
*/
static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
{
- spa_t *spa = vd->vdev_spa;
uint64_t txg;
/*
@@ -1004,8 +997,12 @@ vdev_remove_complete(vdev_t *vd)
* vdev_metaslab_fini()
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
-
txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
vd->vdev_id, txg);
@@ -1025,6 +1022,10 @@ vdev_remove_complete(vdev_t *vd)
/*
* We now release the locks, allowing spa_sync to run and finish the
* removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
*/
(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -1046,6 +1047,9 @@ vdev_remove_complete(vdev_t *vd)
*/
vdev_config_dirty(spa->spa_root_vdev);
(void) spa_vdev_exit(spa, vd, txg, 0);
+
+ if (ev != NULL)
+ spa_event_post(ev);
}
/*
@@ -1056,7 +1060,7 @@ vdev_remove_complete(vdev_t *vd)
* this size again this txg.
*/
static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
uint64_t *max_alloc, dmu_tx_t *tx)
{
uint64_t txg = dmu_tx_get_txg(tx);
@@ -1095,7 +1099,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
while (length > 0) {
uint64_t mylen = MIN(length, thismax);
- int error = spa_vdev_copy_segment(svr->svr_vdev,
+ int error = spa_vdev_copy_segment(vd,
offset, mylen, txg, vca, &zal);
if (error == ENOSPC) {
@@ -1153,12 +1157,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
static void
spa_vdev_remove_thread(void *arg)
{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
uint64_t max_alloc = zfs_remove_max_segment;
uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
@@ -1166,7 +1172,6 @@ spa_vdev_remove_thread(void *arg)
ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_removing);
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(svr->svr_vdev, ==, vd);
ASSERT(vim != NULL);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1247,6 +1252,17 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
mutex_enter(&vca.vca_lock);
while (vca.vca_outstanding_bytes >
zfs_remove_max_copy_bytes) {
@@ -1260,11 +1276,19 @@ spa_vdev_remove_thread(void *arg)
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);
+ /*
+ * Reacquire the vdev_config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
if (txg != last_txg)
max_alloc = zfs_remove_max_segment;
last_txg = txg;
- spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
@@ -1272,6 +1296,9 @@ spa_vdev_remove_thread(void *arg)
}
mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
/*
* Wait for all copies to finish before cleaning up the vca.
*/
@@ -1289,7 +1316,7 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(vd);
+ vdev_remove_complete(spa);
}
}
@@ -1330,7 +1357,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
objset_t *mos = spa->spa_meta_objset;
@@ -1403,8 +1430,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* because we have not allocated mappings for it yet.
*/
uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- range_tree_clear(svr->svr_allocd_segs, syncd,
- msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
mutex_exit(&svr->svr_lock);
}
@@ -1465,7 +1495,7 @@ spa_vdev_remove_cancel(spa_t *spa)
if (spa->spa_vdev_removal == NULL)
return (ENOTACTIVE);
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
@@ -1774,7 +1804,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
vdev_remove_initiate_sync,
- vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return (0);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 5259a0c6f..9bb2459bb 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1212,17 +1212,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
done != NULL);
- /*
- * In the common case, where the parent zio was to a normal vdev,
- * the child zio must be to a child vdev of that vdev. Otherwise,
- * the child zio must be to a top-level vdev.
- */
- if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
- ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
- } else {
- ASSERT3P(vd, ==, vd->vdev_top);
- }
-
if (type == ZIO_TYPE_READ && bp != NULL) {
/*
* If we have the bp, then the child should perform the
@@ -1283,7 +1272,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- int type, zio_priority_t priority, enum zio_flag flags,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -3481,7 +3470,7 @@ zio_vdev_io_start(zio_t *zio)
*/
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_INDUCE_DAMAGE));
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
}
align = 1ULL << vd->vdev_top->vdev_ashift;
@@ -3521,18 +3510,37 @@ zio_vdev_io_start(zio_t *zio)
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
- * This prevents spurious resilvering with nested replication.
- * For example, given a mirror of mirrors, (A+B)+(C+D), if only
- * A is out of date, we'll read from C+D, then use the data to
- * resilver A+B -- but we don't actually want to resilver B, just A.
- * The top-level mirror has no way to know this, so instead we just
- * discard unnecessary repairs as we work our way down the vdev tree.
- * The same logic applies to any form of nested replication:
- * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);