aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
author: Tom Caputi <[email protected]> 2018-10-19 00:06:18 -0400
committer: Brian Behlendorf <[email protected]> 2018-10-18 21:06:18 -0700
commit: 80a91e7469669e2a5da5873b8f09a752f7869062 (patch)
tree: ef5a4462892becccb939b2cd42a54ed580f5894f /module/zfs
parent: 9f438c5f948c0072f16431407a373ead34fabf6e (diff)
Defer new resilvers until the current one ends
Currently, if a resilver is triggered for any reason while an existing one is running, zfs will immediately restart the existing resilver from the beginning to include the new drive. This causes problems for system administrators when a drive fails while another is already resilvering. In this case, the optimal thing to do to reduce risk of data loss is to wait for the current resilver to end before immediately replacing the second failed drive, which allows the system to operate with two incomplete drives for the minimum amount of time.

This patch introduces the resilver_defer feature that essentially does this for the admin without forcing them to wait and monitor the resilver manually. The change requires an on-disk feature since we must mark drives that are part of a deferred resilver in the vdev config to ensure that we do not assume they are done resilvering when an existing resilver completes.

Reviewed-by: Matthew Ahrens <[email protected]>
Reviewed-by: John Kennedy <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: @mmaybee
Signed-off-by: Tom Caputi <[email protected]>
Closes #7732
Diffstat (limited to 'module/zfs')
-rw-r--r-- module/zfs/dsl_scan.c 103
-rw-r--r-- module/zfs/spa.c 16
-rw-r--r-- module/zfs/vdev.c 35
-rw-r--r-- module/zfs/vdev_label.c 6
4 files changed, 150 insertions, 10 deletions
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index b84c2aa45..aff99f275 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -175,6 +175,8 @@ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
unsigned long zfs_async_block_max_blocks = 100000;
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
/*
* We wait a few txgs after importing a pool to begin scanning so that
* the import / mounting code isn't held up by scrub / resilver IO.
@@ -720,6 +722,11 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
+ if (func == POOL_SCAN_RESILVER) {
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ return (0);
+ }
+
if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -736,6 +743,41 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
+/*
+ * Clears vdev_resilver_deferred on each concrete, non-aux leaf under vd; at
+ * the root, also releases an active RESILVER_DEFER feature refcount. Returns
+ * B_TRUE if any leaf that is not dead or offline still needs resilvering.
+ */
+static boolean_t
+dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) { /* visit children first, OR-ing their results */
+ resilver_needed |=
+ dsl_scan_clear_deferred(vd->vdev_child[c], tx);
+ }
+
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); /* pairs with the spa_feature_incr() in dsl_scan_sync() */
+ vdev_config_dirty(vd); /* NOTE(review): presumably so the cleared flags get written out -- confirm */
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed); /* not a concrete, non-aux leaf: just pass the children's answer up */
+
+ if (vd->vdev_resilver_deferred) /* guard is redundant; the flag ends up B_FALSE either way */
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline && /* leaf result: not dead, not offline, ... */
+ vdev_resilver_needed(vd, NULL, NULL)); /* ... and vdev_resilver_needed() reports work to do */
+}
+
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -835,6 +877,25 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * Clear any deferred_resilver flags in the config.
+ * If there are drives that need resilvering, kick
+ * off an asynchronous request to start resilver.
+ * dsl_scan_clear_deferred() may update the config
+ * before the resilver can restart. In the event of
+ * a crash during this period, the spa loading code
+ * will find the drives that need to be resilvered
+ * when the machine reboots and start the resilver then.
+ */
+ boolean_t resilver_needed =
+ dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+ if (resilver_needed) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
}
scn->scn_phys.scn_end_time = gethrestime_sec();
@@ -2967,6 +3028,26 @@ dsl_scan_active(dsl_scan_t *scn)
}
static boolean_t
+dsl_scan_check_deferred(vdev_t *vd) /* B_TRUE if any concrete, non-aux leaf at or under vd is not resilver-deferred */
+{
+ boolean_t need_resilver = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++) { /* OR together the answers from every child subtree */
+ need_resilver |=
+ dsl_scan_check_deferred(vd->vdev_child[c]);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (need_resilver); /* interior, aux, or non-concrete vdev: report only what the children said */
+
+ if (!vd->vdev_resilver_deferred) /* NOTE(review): tests only "not deferred", not whether this leaf needs resilvering -- confirm intended */
+ need_resilver = B_TRUE;
+
+ return (need_resilver);
+}
+
+static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
@@ -3013,6 +3094,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);
+ /*
+ * Check that this top-level vdev has a device under it which
+ * is resilvering and is not deferred.
+ */
+ if (!dsl_scan_check_deferred(vd))
+ return (B_FALSE);
+
return (B_TRUE);
}
@@ -3173,12 +3261,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa_t *spa = dp->dp_spa;
state_sync_type_t sync_type = SYNC_OPTIONAL;
+ if (spa->spa_resilver_deferred &&
+ !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
- * imported (see dsl_scan_init).
+ * imported (see dsl_scan_init). We also restart scans if there
+ * is a deferred resilver and the user has manually disabled
+ * deferred resilvers via the tunable.
*/
- if (dsl_scan_restarting(scn, tx)) {
+ if (dsl_scan_restarting(scn, tx) ||
+ (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
@@ -4000,4 +4095,8 @@ MODULE_PARM_DESC(zfs_scan_strict_mem_lim,
module_param(zfs_scan_fill_weight, int, 0644);
MODULE_PARM_DESC(zfs_scan_fill_weight,
"Tunable to adjust bias towards more filled segments during scans");
+
+module_param(zfs_resilver_disable_defer, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_disable_defer,
+ "Process all resilvers immediately");
#endif
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index fdce49c40..3785981b7 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -6059,9 +6059,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
+ * respective datasets. We do not do this if resilvers have been
+ * deferred.
*/
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, newvd);
+ else
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6933,6 +6938,7 @@ static void
spa_async_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
int tasks;
ASSERT(spa->spa_sync_on);
@@ -7008,8 +7014,10 @@ spa_async_thread(void *arg)
/*
* Kick off a resilver.
*/
- if (tasks & SPA_ASYNC_RESILVER)
- dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ if (tasks & SPA_ASYNC_RESILVER &&
+ (!dsl_scan_resilvering(dp) ||
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+ dsl_resilver_restart(dp, 0);
/*
* Let the world know that we're done.
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 2c95626c4..1521acc40 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -790,6 +790,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
+ if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+
/*
* In general, when importing a pool we want to ignore the
* persistent fault state, as the diagnosis made on another
@@ -1798,8 +1801,13 @@ vdev_open(vdev_t *vd)
* since this would just restart the scrub we are already doing.
*/
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ vdev_resilver_needed(vd, NULL, NULL)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
return (0);
}
@@ -2488,6 +2496,9 @@ vdev_dtl_should_excise(vdev_t *vd)
if (vd->vdev_state < VDEV_STATE_DEGRADED)
return (B_FALSE);
+ if (vd->vdev_resilver_deferred)
+ return (B_FALSE);
+
if (vd->vdev_resilver_txg == 0 ||
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
@@ -3618,8 +3629,14 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
- if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
}
@@ -3840,6 +3857,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0);
@@ -4578,6 +4597,14 @@ vdev_deadman(vdev_t *vd, char *tag)
}
}
+void
+vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) /* marks leaf vd, and the pool as a whole, as having a deferred resilver */
+{
+ ASSERT(vd->vdev_ops->vdev_op_leaf); /* NOTE(review): the vdev_clear() call site shows no leaf check before calling this -- confirm it cannot pass an interior vdev */
+ vd->vdev_resilver_deferred = B_TRUE; /* per-vdev flag; vdev_config_generate() emits ZPOOL_CONFIG_RESILVER_DEFER when it is set */
+ spa->spa_resilver_deferred = B_TRUE; /* pool-wide flag; dsl_scan_sync() checks it before activating the feature */
+}
+
#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 439ab7438..7e86e3a8b 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -524,6 +524,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
vd->vdev_top_zap);
}
+
+ if (vd->vdev_resilver_deferred) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(spa->spa_resilver_deferred);
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
+ }
}
if (getstats) {