summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/ztest/ztest.c3
-rw-r--r--man/man5/zfs-module-parameters.524
-rw-r--r--module/zfs/vdev_indirect.c124
3 files changed, 103 insertions, 48 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index c81d446a5..315169571 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool)
ztest_get_zdb_bin(bin, len);
(void) sprintf(zdb,
- "%s -bcc%s%s -G -d -U %s %s",
+ "%s -bcc%s%s -G -d -U %s "
+ "-o zfs_reconstruct_indirect_combinations_max=1000000 %s",
bin,
ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "",
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 6e670facb..8b007ea17 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
-\fBzfs_reconstruct_indirect_segments_max\fR (int)
-.ad
-.RS 12n
-When a split block which is part of a indirect vdev contains more than this
-many segments, consider it too computationally expensive to check all possible
-combinations. Instead, operate under the assumption that only a few segment
-copies are damaged and the majority of segment copies are good, in which case
-it is reasonable to randomly select sample combinations. This allows all the
-segment copies to participate fairly in the reconstruction and prevents the
-repeated use of one bad copy.
-.sp
-Default value: \fB10\fR.
+\fBzfs_reconstruct_indirect_combinations_max\fR (int)
+.ad
+.RS 12na
+If an indirect split block contains more than this many possible unique
+combinations when being reconstructed, consider it too computationally
+expensive to check them all. Instead, try at most
+\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
+combinations each time the block is accessed. This allows all segment
+copies to participate fairly in the reconstruction when all combinations
+cannot be checked and prevents repeated use of one bad copy.
+.sp
+Default value: \fB100\fR.
.RE
.sp
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 3ccdfee3b..b30ddaf26 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ms = 0;
/*
- * If a split block contains more than this many segments, consider it too
- * computationally expensive to check all (2^num_segments) possible
- * combinations. Instead, try at most 2^_segments_max randomly-selected
- * combinations.
- *
- * This is reasonable if only a few segment copies are damaged and the
- * majority of segment copies are good. It allows all segment copies to
- * participate fairly in the reconstruction and prevents repeated use of
- * one bad copy.
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed. This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
*/
-int zfs_reconstruct_indirect_segments_max = 10;
+int zfs_reconstruct_indirect_combinations_max = 100;
/*
* The indirect_child_t represents the vdev that we will read from, when we
@@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10;
typedef struct indirect_child {
abd_t *ic_data;
vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is -1 when the ic_data contents are unique, when it
+ * is determined to be a duplicate it refers to the primary child.
+ */
+ int ic_duplicate;
} indirect_child_t;
/*
@@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio)
ic->ic_data = abd_alloc_sametype(zio->io_abd,
is->is_size);
+ ic->ic_duplicate = -1;
zio_nowait(zio_vdev_child_io(zio, NULL,
ic->ic_vdev, is->is_target_offset, ic->ic_data,
@@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio)
continue;
if (ic->ic_data == NULL)
continue;
- if (abd_cmp(good_child->ic_data, ic->ic_data) == 0)
+ if (ic->ic_duplicate == is->is_good_child)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
@@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
*
* If we pointed to any mirror vdevs, this effectively does the job of the
* mirror. The mirror vdev code can't do its own job because we don't know
- * the checksum of each split segment individually. We have to try every
- * combination of copies of split segments, until we find one that checksums
- * correctly. (Or until we have tried all combinations, or have tried
- * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
- * set io_error to ECKSUM to propagate the error up to the user.)
+ * the checksum of each split segment individually.
*
- * For example, if we have 3 segments in the split,
- * and each points to a 2-way mirror, we will have the following pieces of
- * data:
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * discarded as an optimization to reduce the search space. After pruning
+ * there will exist at most one valid combination.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
*
* | mirror child
* split | [0] [1]
@@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
{
indirect_vsd_t *iv = zio->io_vsd;
uint64_t attempts = 0;
- uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
- int segments = 0;
+ uint64_t attempts_max = UINT64_MAX;
+ uint64_t combinations = 1;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ attempts_max = zfs_reconstruct_indirect_combinations_max;
+ /*
+ * Discard duplicate copies of split segments to minimize the
+ * number of unique combinations when attempting reconstruction.
+ */
for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is))
- segments++;
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ uint64_t is_copies = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ if (is->is_child[i].ic_data == NULL)
+ continue;
+
+ for (int j = i + 1; j < is->is_children; j++) {
+ if (is->is_child[j].ic_data == NULL)
+ continue;
+
+ if (is->is_child[j].ic_duplicate == -1 &&
+ abd_cmp(is->is_child[i].ic_data,
+ is->is_child[j].ic_data) == 0) {
+ is->is_child[j].ic_duplicate = i;
+ }
+ }
+
+ is_copies++;
+ }
+
+ /* Reconstruction is impossible, no valid is->is_child[] */
+ if (is_copies == 0) {
+ zio->io_error = EIO;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ combinations *= is_copies;
+ }
for (;;) {
/* copy data from splits to main zio */
@@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
goto next;
}
+ /*
+ * If this child is a duplicate, its is_duplicate will
+ * refer to the primary copy. Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_duplicate >= 0) {
+ ret = ECKSUM;
+ goto next;
+ }
+
abd_copy_off(zio->io_abd,
is->is_child[is->is_good_child].ic_data,
is->is_split_offset, 0, is->is_size);
@@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
boolean_t more;
next:
more = B_FALSE;
- if (segments <= zfs_reconstruct_indirect_segments_max) {
+ if (combinations <= attempts_max) {
/*
- * There are relatively few segments, so
- * deterministically check all combinations. We do
- * this by by adding one to the first split's
- * good_child. If it overflows, then "carry over" to
- * the next split (like counting in base is_children,
- * but each digit can have a different base).
+ * There are relatively few possible combinations, so
+ * deterministically check them all. We do this by
+ * adding one to the first split's good_child. If it
+ * overflows, then "carry over" to the next split
+ * (like counting in base is_children, but each
+ * digit can have a different base).
*/
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
@@ -1485,8 +1535,12 @@ next:
*/
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_good_child =
- spa_get_random(is->is_children);
+ int c = spa_get_random(is->is_children);
+
+ while (is->is_child[c].ic_duplicate >= 0)
+ c = (c + 1) % is->is_children;
+
+ is->is_good_child = c;
}
more = B_TRUE;
}
@@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
"Delay while condensing vdev mapping");
-module_param(zfs_reconstruct_indirect_segments_max, int, 0644);
-MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max,
- "Maximum number of split segments check all combinations");
+module_param(zfs_reconstruct_indirect_combinations_max, int, 0644);
+MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max,
+ "Maximum number of combinations when reconstructing split segments");
#endif