aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2021-05-20 15:05:26 -0700
committerGitHub <[email protected]>2021-05-20 15:05:26 -0700
commit8fb577ae6da03ed72edca9adad027779e69db146 (patch)
tree1e8ce6bc6aa49ad0fa62cac9f7dc569106093d46 /module/zfs
parent6ac2d7f76f8b56f0d5869c60c0b49b15018d1a53 (diff)
Fix dRAID sequential resilver silent damage handling
This change addresses two distinct scenarios which are possible when performing a sequential resilver to a dRAID pool with vdevs that contain silent unknown damage. Which in this circumstance took the form of the devices being intentionally overwritten with zeros. However, it could also result from a device returning incorrect data while a sequential resilver was in progress. Scenario 1) A sequential resilver is performed while all of the dRAID vdevs are ONLINE and there is silent damage present on the vdev being resilvered. In this case, nothing will be repaired by vdev_raidz_io_done_reconstruct_known_missing() because rc->rc_error isn't set on any of the raid columns. To address this vdev_draid_io_start_read() has been updated to always mark the resilvering column as ESTALE for sequential resilver IO. Scenario 2) Multiple columns contain silent damage for the same block and a sequential resilver is performed. In this case it's impossible to generate the correct data from parity unless all of the damaged columns are being sequentially resilvered (and thus only good data is used to generate parity). This is as expected and there's nothing which can be done about it. However, we need to be careful not to make the situation worse. Since we can't verify the data is actually good without a checksum, we must only repair the devices which are being sequentially resilvered. Otherwise, an incorrect repair to a device which previously contained good data could effectively lock in the damage and make reconstruction impossible. A check for this was added to vdev_raidz_io_done_verified() along with a new test case. Lastly, this change updates the redundancy_draid_spare1 and redundancy_draid_spare3 test cases to be more representative of normal dRAID replacement operation. Specifically, what we care about is that the scrub run after a sequential resilver does not find additional blocks which need repair. 
This would indicate the sequential resilver failed to rebuild a section of one of the devices. Note also the tests were switched to using the verify_pool() function which still checks for checksum errors. Reviewed-by: Mark Maybee <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #12061
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/vdev_draid.c41
-rw-r--r--module/zfs/vdev_raidz.c9
2 files changed, 44 insertions, 6 deletions
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index c65ce1cd6..20b1457f0 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1009,7 +1009,8 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rc->rc_error = 0;
rc->rc_tried = 0;
rc->rc_skipped = 0;
- rc->rc_repair = 0;
+ rc->rc_force_repair = 0;
+ rc->rc_allow_repair = 1;
rc->rc_need_orig_restore = B_FALSE;
if (q == 0 && i >= bc)
@@ -1891,6 +1892,36 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
vdev_t *svd;
/*
+ * Sequential rebuilds need to always consider the data
+ * on the child being rebuilt to be stale. This is
+ * important when all columns are available to aid
+ * known reconstruction in identifing which columns
+ * contain incorrect data.
+ *
+ * Furthermore, all repairs need to be constrained to
+ * the devices being rebuilt because without a checksum
+ * we cannot verify the data is actually correct and
+ * performing an incorrect repair could result in
+ * locking in damage and making the data unrecoverable.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+ if (vdev_draid_rebuilding(cvd)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ rc->rc_allow_repair = 1;
+ continue;
+ } else {
+ rc->rc_allow_repair = 0;
+ }
+ } else {
+ rc->rc_allow_repair = 1;
+ }
+
+ /*
* If this child is a distributed spare then the
* offset might reside on the vdev being replaced.
* In which case this data must be written to the
@@ -1903,7 +1934,10 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_offset);
if (svd && (svd->vdev_ops == &vdev_spare_ops ||
svd->vdev_ops == &vdev_replacing_ops)) {
- rc->rc_repair = 1;
+ rc->rc_force_repair = 1;
+
+ if (vdev_draid_rebuilding(svd))
+ rc->rc_allow_repair = 1;
}
}
@@ -1914,7 +1948,8 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
if ((cvd->vdev_ops == &vdev_spare_ops ||
cvd->vdev_ops == &vdev_replacing_ops) &&
vdev_draid_rebuilding(cvd)) {
- rc->rc_repair = 1;
+ rc->rc_force_repair = 1;
+ rc->rc_allow_repair = 1;
}
}
}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 020b3bc95..1feebf708 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -269,7 +269,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rc->rc_error = 0;
rc->rc_tried = 0;
rc->rc_skipped = 0;
- rc->rc_repair = 0;
+ rc->rc_force_repair = 0;
+ rc->rc_allow_repair = 1;
rc->rc_need_orig_restore = B_FALSE;
if (c >= acols)
@@ -1811,8 +1812,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
vdev_t *vd = zio->io_vd;
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- if ((rc->rc_error == 0 || rc->rc_size == 0) &&
- (rc->rc_repair == 0)) {
+ if (!rc->rc_allow_repair) {
+ continue;
+ } else if (!rc->rc_force_repair &&
+ (rc->rc_error == 0 || rc->rc_size == 0)) {
continue;
}