aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/dsl_scan.c
diff options
context:
space:
mode:
authorJohn Poduska <[email protected]>2020-05-13 13:54:27 -0400
committerGitHub <[email protected]>2020-05-13 10:54:27 -0700
commit41035a049643ff7083a6cb6cd43b8eb70a7d18a1 (patch)
tree121cec91a2b10ca6e9fb7d376a7599d47f4aee8c /module/zfs/dsl_scan.c
parentb29e31d80d6cb78dbd889e9b529333944b4c3ba1 (diff)
Resilver restarts unnecessarily when it encounters errors
When a resilver finishes, vdev_dtl_reassess is called to hopefully excise DTL_MISSING (amongst other things). If there are errors during the resilver, they are tracked in DTL_SCRUB, as spelled out in the block comment in vdev.c. DTL_SCRUB is in-core only, so it can only be used if the pool was online for the whole resilver. This state is tracked with the spa_scrub_started flag, which only gets set when the scan is initialized. Unfortunately, this flag gets cleared right before vdev_dtl_reassess gets called, so if there are any errors during the scan, DTL_MISSING will never get excised and the resilver will just continually restart. This fix simply moves clearing that flag until after the call to vdev_dtl_reasses. In addition, if a pool is imported and already has scn_errors > 0, this change will restart the resilver immediately instead of doing the rest of the scan and then restarting it from the beginning. On the other hand, if scn_errors == 0 at import, then no errors have been encountered so far, so the spa_scrub_started flag can be safely set. A test has been added to verify that resilver does not restart when relevant DTL's are available. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Paul Zuchowski <[email protected]> Signed-off-by: John Poduska <[email protected]> Closes #10291
Diffstat (limited to 'module/zfs/dsl_scan.c')
-rw-r--r--module/zfs/dsl_scan.c23
1 files changed, 22 insertions, 1 deletions
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 74ef2e155..f09501793 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -542,6 +542,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zfs_dbgmsg("new-style scrub was modified "
"by old software; restarting in txg %llu",
(longlong_t)scn->scn_restart_txg);
+ } else if (dsl_scan_resilvering(dp)) {
+ /*
+ * If a resilver is in progress and there are already
+ * errors, restart it instead of finishing this scan and
+ * then restarting it. If there haven't been any errors
+ * then remember that the incore DTL is valid.
+ */
+ if (scn->scn_phys.scn_errors > 0) {
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("resilver can't excise DTL_MISSING "
+ "when finished; restarting in txg %llu",
+ (u_longlong_t)scn->scn_restart_txg);
+ } else {
+ /* it's safe to excise DTL when finished */
+ spa->spa_scrub_started = B_TRUE;
+ }
}
}
@@ -887,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- spa->spa_scrub_started = B_FALSE;
spa->spa_scrub_active = B_FALSE;
/*
@@ -915,6 +930,12 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
spa_errlog_rotate(spa);
/*
+ * Don't clear flag until after vdev_dtl_reassess to ensure that
+ * DTL_MISSING will get updated when possible.
+ */
+ spa->spa_scrub_started = B_FALSE;
+
+ /*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/