diff options
-rw-r--r-- | cmd/ztest/ztest.c | 3 | ||||
-rw-r--r-- | man/man5/zfs-module-parameters.5 | 24 | ||||
-rw-r--r-- | module/zfs/vdev_indirect.c | 124 |
3 files changed, 103 insertions, 48 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index c81d446a5..315169571 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool) ztest_get_zdb_bin(bin, len); (void) sprintf(zdb, - "%s -bcc%s%s -G -d -U %s %s", + "%s -bcc%s%s -G -d -U %s " + "-o zfs_reconstruct_indirect_combinations_max=1000000 %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 6e670facb..8b007ea17 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default). .sp .ne 2 .na -\fBzfs_reconstruct_indirect_segments_max\fR (int) -.ad -.RS 12n -When a split block which is part of a indirect vdev contains more than this -many segments, consider it too computationally expensive to check all possible -combinations. Instead, operate under the assumption that only a few segment -copies are damaged and the majority of segment copies are good, in which case -it is reasonable to randomly select sample combinations. This allows all the -segment copies to participate fairly in the reconstruction and prevents the -repeated use of one bad copy. -.sp -Default value: \fB10\fR. +\fBzfs_reconstruct_indirect_combinations_max\fR (int) +.ad +.RS 12na +If an indirect split block contains more than this many possible unique +combinations when being reconstructed, consider it too computationally +expensive to check them all. Instead, try at most +\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected +combinations each time the block is accessed. This allows all segment +copies to participate fairly in the reconstruction when all combinations +cannot be checked and prevents repeated use of one bad copy. +.sp +Default value: \fB100\fR. .RE .sp diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 3ccdfee3b..b30ddaf26 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; int zfs_condense_indirect_commit_entry_delay_ms = 0; /* - * If a split block contains more than this many segments, consider it too - * computationally expensive to check all (2^num_segments) possible - * combinations. Instead, try at most 2^_segments_max randomly-selected - * combinations. - * - * This is reasonable if only a few segment copies are damaged and the - * majority of segment copies are good. It allows all segment copies to - * participate fairly in the reconstruction and prevents repeated use of - * one bad copy. + * If an indirect split block contains more than this many possible unique + * combinations when being reconstructed, consider it too computationally + * expensive to check them all. Instead, try at most 100 randomly-selected + * combinations each time the block is accessed. This allows all segment + * copies to participate fairly in the reconstruction when all combinations + * cannot be checked and prevents repeated use of one bad copy. */ -int zfs_reconstruct_indirect_segments_max = 10; +int zfs_reconstruct_indirect_combinations_max = 100; /* * The indirect_child_t represents the vdev that we will read from, when we @@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10; typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; + + /* + * ic_duplicate is -1 when the ic_data contents are unique, when it + * is determined to be a duplicate it refers to the primary child. + */ + int ic_duplicate; } indirect_child_t; /* @@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio) ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); + ic->ic_duplicate = -1; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, @@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio) continue; if (ic->ic_data == NULL) continue; - if (abd_cmp(good_child->ic_data, ic->ic_data) == 0) + if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, @@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio) * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know - * the checksum of each split segment individually. We have to try every - * combination of copies of split segments, until we find one that checksums - * correctly. (Or until we have tried all combinations, or have tried - * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we - * set io_error to ECKSUM to propagate the error up to the user.) + * the checksum of each split segment individually. * - * For example, if we have 3 segments in the split, - * and each points to a 2-way mirror, we will have the following pieces of - * data: + * We have to try every unique combination of copies of split segments, until + * we find one that checksums correctly. Duplicate segment copies are first + * discarded as an optimization to reduce the search space. After pruning + * there will exist at most one valid combination. + * + * When the total number of combinations is small they can all be checked. + * For example, if we have 3 segments in the split, and each points to a + * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] @@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; uint64_t attempts = 0; - uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; - int segments = 0; + uint64_t attempts_max = UINT64_MAX; + uint64_t combinations = 1; + + if (zfs_reconstruct_indirect_combinations_max > 0) + attempts_max = zfs_reconstruct_indirect_combinations_max; + /* + * Discard duplicate copies of split segments to minimize the + * number of unique combinations when attempting reconstruction. + */ for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) - segments++; + is != NULL; is = list_next(&iv->iv_splits, is)) { + uint64_t is_copies = 0; + + for (int i = 0; i < is->is_children; i++) { + if (is->is_child[i].ic_data == NULL) + continue; + + for (int j = i + 1; j < is->is_children; j++) { + if (is->is_child[j].ic_data == NULL) + continue; + + if (is->is_child[j].ic_duplicate == -1 && + abd_cmp(is->is_child[i].ic_data, + is->is_child[j].ic_data) == 0) { + is->is_child[j].ic_duplicate = i; + } + } + + is_copies++; + } + + /* Reconstruction is impossible, no valid is->is_child[] */ + if (is_copies == 0) { + zio->io_error = EIO; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + + combinations *= is_copies; + } for (;;) { /* copy data from splits to main zio */ @@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) goto next; } + /* + * If this child is a duplicate, its is_duplicate will + * refer to the primary copy. Skip this combination. + */ + if (is->is_child[is->is_good_child].ic_duplicate >= 0) { + ret = ECKSUM; + goto next; + } + abd_copy_off(zio->io_abd, is->is_child[is->is_good_child].ic_data, is->is_split_offset, 0, is->is_size); @@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) boolean_t more; next: more = B_FALSE; - if (segments <= zfs_reconstruct_indirect_segments_max) { + if (combinations <= attempts_max) { /* - * There are relatively few segments, so - * deterministically check all combinations. We do - * this by by adding one to the first split's - * good_child. If it overflows, then "carry over" to - * the next split (like counting in base is_children, - * but each digit can have a different base). + * There are relatively few possible combinations, so + * deterministically check them all. We do this by + * adding one to the first split's good_child. If it + * overflows, then "carry over" to the next split + * (like counting in base is_children, but each + * digit can have a different base). */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { @@ -1485,8 +1535,12 @@ next: */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_good_child = - spa_get_random(is->is_children); + int c = spa_get_random(is->is_children); + + while (is->is_child[c].ic_duplicate >= 0) + c = (c + 1) % is->is_children; + + is->is_good_child = c; } more = B_TRUE; } @@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, "Delay while condensing vdev mapping"); -module_param(zfs_reconstruct_indirect_segments_max, int, 0644); -MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max, - "Maximum number of split segments check all combinations"); +module_param(zfs_reconstruct_indirect_combinations_max, int, 0644); +MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max, + "Maximum number of combinations when reconstructing split segments"); #endif |