summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2018-03-29 14:50:40 -0700
committerBrian Behlendorf <[email protected]>2018-04-14 12:22:43 -0700
commit4589f3ae4c1bb435777da8640eb915f3c713b14d (patch)
treef37cbdab9cd98778a1aceac9cb2b87a302920983
parent9e052db4627ca945db1e3fa63ed81b156d9d7562 (diff)
Optimize possible split block search space
Remove duplicate segment copies to minimize the possible search space for reconstruction. Once reduced an accurate assessment can be made regarding the difficulty in reconstructing the block. Also, ztest will now run zdb with zfs_reconstruct_indirect_combinations_max set to 1000000 in an attempt to avoid checksum errors. Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: Tim Chase <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #6900
-rw-r--r--cmd/ztest/ztest.c3
-rw-r--r--man/man5/zfs-module-parameters.524
-rw-r--r--module/zfs/vdev_indirect.c124
3 files changed, 103 insertions, 48 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index c81d446a5..315169571 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool)
ztest_get_zdb_bin(bin, len);
(void) sprintf(zdb,
- "%s -bcc%s%s -G -d -U %s %s",
+ "%s -bcc%s%s -G -d -U %s "
+ "-o zfs_reconstruct_indirect_combinations_max=1000000 %s",
bin,
ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "",
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 6e670facb..8b007ea17 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
-\fBzfs_reconstruct_indirect_segments_max\fR (int)
-.ad
-.RS 12n
-When a split block which is part of a indirect vdev contains more than this
-many segments, consider it too computationally expensive to check all possible
-combinations. Instead, operate under the assumption that only a few segment
-copies are damaged and the majority of segment copies are good, in which case
-it is reasonable to randomly select sample combinations. This allows all the
-segment copies to participate fairly in the reconstruction and prevents the
-repeated use of one bad copy.
-.sp
-Default value: \fB10\fR.
+\fBzfs_reconstruct_indirect_combinations_max\fR (int)
+.ad
+.RS 12na
+If an indirect split block contains more than this many possible unique
+combinations when being reconstructed, consider it too computationally
+expensive to check them all. Instead, try at most
+\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
+combinations each time the block is accessed. This allows all segment
+copies to participate fairly in the reconstruction when all combinations
+cannot be checked and prevents repeated use of one bad copy.
+.sp
+Default value: \fB100\fR.
.RE
.sp
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 3ccdfee3b..b30ddaf26 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ms = 0;
/*
- * If a split block contains more than this many segments, consider it too
- * computationally expensive to check all (2^num_segments) possible
- * combinations. Instead, try at most 2^_segments_max randomly-selected
- * combinations.
- *
- * This is reasonable if only a few segment copies are damaged and the
- * majority of segment copies are good. It allows all segment copies to
- * participate fairly in the reconstruction and prevents repeated use of
- * one bad copy.
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed. This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
*/
-int zfs_reconstruct_indirect_segments_max = 10;
+int zfs_reconstruct_indirect_combinations_max = 100;
/*
* The indirect_child_t represents the vdev that we will read from, when we
@@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10;
typedef struct indirect_child {
abd_t *ic_data;
vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is -1 when the ic_data contents are unique, when it
+ * is determined to be a duplicate it refers to the primary child.
+ */
+ int ic_duplicate;
} indirect_child_t;
/*
@@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio)
ic->ic_data = abd_alloc_sametype(zio->io_abd,
is->is_size);
+ ic->ic_duplicate = -1;
zio_nowait(zio_vdev_child_io(zio, NULL,
ic->ic_vdev, is->is_target_offset, ic->ic_data,
@@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio)
continue;
if (ic->ic_data == NULL)
continue;
- if (abd_cmp(good_child->ic_data, ic->ic_data) == 0)
+ if (ic->ic_duplicate == is->is_good_child)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
@@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
*
* If we pointed to any mirror vdevs, this effectively does the job of the
* mirror. The mirror vdev code can't do its own job because we don't know
- * the checksum of each split segment individually. We have to try every
- * combination of copies of split segments, until we find one that checksums
- * correctly. (Or until we have tried all combinations, or have tried
- * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
- * set io_error to ECKSUM to propagate the error up to the user.)
+ * the checksum of each split segment individually.
*
- * For example, if we have 3 segments in the split,
- * and each points to a 2-way mirror, we will have the following pieces of
- * data:
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * discarded as an optimization to reduce the search space. After pruning
+ * there will exist at most one valid combination.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
*
* | mirror child
* split | [0] [1]
@@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
{
indirect_vsd_t *iv = zio->io_vsd;
uint64_t attempts = 0;
- uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
- int segments = 0;
+ uint64_t attempts_max = UINT64_MAX;
+ uint64_t combinations = 1;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ attempts_max = zfs_reconstruct_indirect_combinations_max;
+ /*
+ * Discard duplicate copies of split segments to minimize the
+ * number of unique combinations when attempting reconstruction.
+ */
for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is))
- segments++;
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ uint64_t is_copies = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ if (is->is_child[i].ic_data == NULL)
+ continue;
+
+ for (int j = i + 1; j < is->is_children; j++) {
+ if (is->is_child[j].ic_data == NULL)
+ continue;
+
+ if (is->is_child[j].ic_duplicate == -1 &&
+ abd_cmp(is->is_child[i].ic_data,
+ is->is_child[j].ic_data) == 0) {
+ is->is_child[j].ic_duplicate = i;
+ }
+ }
+
+ is_copies++;
+ }
+
+ /* Reconstruction is impossible, no valid is->is_child[] */
+ if (is_copies == 0) {
+ zio->io_error = EIO;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ combinations *= is_copies;
+ }
for (;;) {
/* copy data from splits to main zio */
@@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
goto next;
}
+ /*
+ * If this child is a duplicate, its is_duplicate will
+ * refer to the primary copy. Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_duplicate >= 0) {
+ ret = ECKSUM;
+ goto next;
+ }
+
abd_copy_off(zio->io_abd,
is->is_child[is->is_good_child].ic_data,
is->is_split_offset, 0, is->is_size);
@@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
boolean_t more;
next:
more = B_FALSE;
- if (segments <= zfs_reconstruct_indirect_segments_max) {
+ if (combinations <= attempts_max) {
/*
- * There are relatively few segments, so
- * deterministically check all combinations. We do
- * this by by adding one to the first split's
- * good_child. If it overflows, then "carry over" to
- * the next split (like counting in base is_children,
- * but each digit can have a different base).
+ * There are relatively few possible combinations, so
+ * deterministically check them all. We do this by
+ * adding one to the first split's good_child. If it
+ * overflows, then "carry over" to the next split
+ * (like counting in base is_children, but each
+ * digit can have a different base).
*/
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
@@ -1485,8 +1535,12 @@ next:
*/
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_good_child =
- spa_get_random(is->is_children);
+ int c = spa_get_random(is->is_children);
+
+ while (is->is_child[c].ic_duplicate >= 0)
+ c = (c + 1) % is->is_children;
+
+ is->is_good_child = c;
}
more = B_TRUE;
}
@@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
"Delay while condensing vdev mapping");
-module_param(zfs_reconstruct_indirect_segments_max, int, 0644);
-MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max,
- "Maximum number of split segments check all combinations");
+module_param(zfs_reconstruct_indirect_combinations_max, int, 0644);
+MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max,
+ "Maximum number of combinations when reconstructing split segments");
#endif