aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorGeorge Wilson <[email protected]>2018-12-19 09:20:39 -0700
committerBrian Behlendorf <[email protected]>2019-01-07 11:03:08 -0800
commitc10d37dd9f5d56e19330add732f1a713250d24f7 (patch)
treea43f6b7c1592612fe4277bcaf5d99c02543523b8 /module
parent619f09769393d4e0cbaa5f662362138e1c699159 (diff)
zfs initialize performance enhancements
PROBLEM ======== When invoking "zpool initialize" on a pool the command will create a thread to initialize each disk. Unfortunately, it does this serially across many transaction groups which can result in commands taking a long time to return to the user and may appear hung. The same thing is true when trying to suspend/cancel the operation. SOLUTION ========= This change refactors the way we invoke the initialize interface to ensure we can start or stop the initialization in just a few transaction groups. When stopping or cancelling a vdev initialization perform it in two phases. First signal each vdev initialization thread that it should exit, then after all threads have been signaled wait for them to exit. On a pool with 40 leaf vdevs this reduces the vdev initialize stop/cancel time from ~10 minutes to under a second. The reason for this is spa_vdev_initialize() no longer needs to wait on multiple full TXGs per leaf vdev being stopped. This commit additionally adds some missing checks for the passed "initialize_vdevs" input nvlist. The contents of the user provided input "initialize_vdevs" nvlist must be validated to ensure all values are uint64s. This is done in zfs_ioc_pool_initialize() in order to keep all of these checks in a single location. Updated the innvl and outnvl comments to match the formatting used for all other new style ioctls. Reviewed by: Matt Ahrens <[email protected]> Reviewed-by: loli10K <[email protected]> Reviewed-by: Tim Chase <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Signed-off-by: George Wilson <[email protected]> Closes #8230
Diffstat (limited to 'module')
-rw-r--r--module/zfs/spa.c90
-rw-r--r--module/zfs/spa_misc.c3
-rw-r--r--module/zfs/vdev.c1
-rw-r--r--module/zfs/vdev_initialize.c83
-rw-r--r--module/zfs/vdev_removal.c2
-rw-r--r--module/zfs/zfs_ioctl.c57
6 files changed, 160 insertions, 76 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 622be75f9..c4ff5002b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -6381,32 +6381,24 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (error);
}
-int
-spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ list_t *vd_list)
{
- /*
- * We hold the namespace lock through the whole function
- * to prevent any changes to the pool while we're starting or
- * stopping initialization. The config and state locks are held so that
- * we can properly assess the vdev state before we commit to
- * the initializing operation.
- */
- mutex_enter(&spa_namespace_lock);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
/* Look up vdev and ensure it's a leaf. */
vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd == NULL || vd->vdev_detached) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ENODEV));
} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EINVAL));
} else if (!vdev_writeable(vd)) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EROFS));
}
mutex_enter(&vd->vdev_initialize_lock);
@@ -6423,18 +6415,15 @@ spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
(vd->vdev_initialize_thread != NULL ||
vd->vdev_top->vdev_removing)) {
mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
(vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ESRCH));
} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ESRCH));
}
@@ -6443,23 +6432,65 @@ spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
vdev_initialize(vd);
break;
case POOL_INITIALIZE_CANCEL:
- vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
break;
case POOL_INITIALIZE_SUSPEND:
- vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
break;
default:
panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
}
mutex_exit(&vd->vdev_initialize_lock);
+ return (0);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+ nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
+ &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all initialize threads to stop. */
+ vdev_initialize_stop_wait(spa, &vd_list);
+
/* Sync out the initializing state */
txg_wait_synced(spa->spa_dsl_pool, 0);
mutex_exit(&spa_namespace_lock);
- return (0);
-}
+ list_destroy(&vd_list);
+ return (total_errors);
+}
/*
* Split a set of devices from their mirrors, and create a new pool from them.
@@ -6669,18 +6700,25 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ /*
+ * Temporarily stop the initializing activity. We set the state to
+ * ACTIVE so that we know to resume the initializing once the split
+ * has completed.
+ */
+ list_t vd_list;
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
for (c = 0; c < children; c++) {
if (vml[c] != NULL) {
- /*
- * Temporarily stop the initializing activity. We set
- * the state to ACTIVE so that we know to resume
- * the initializing once the split has completed.
- */
mutex_enter(&vml[c]->vdev_initialize_lock);
- vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
+ &vd_list);
mutex_exit(&vml[c]->vdev_initialize_lock);
}
}
+ vdev_initialize_stop_wait(spa, &vd_list);
+ list_destroy(&vd_list);
newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index dfac92d45..877f312b1 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1197,7 +1197,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
if (vd->vdev_ops->vdev_op_leaf) {
mutex_enter(&vd->vdev_initialize_lock);
- vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
+ NULL);
mutex_exit(&vd->vdev_initialize_lock);
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f808f8ee7..26ef5b4c5 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -530,6 +530,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index fcd2c76f9..939a5e938 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -702,18 +702,51 @@ vdev_initialize(vdev_t *vd)
}
/*
- * Stop initializng a device, with the resultant initialing state being
- * tgt_state. Blocks until the initializing thread has exited.
- * Caller must hold vdev_initialize_lock and must not be writing to the spa
- * config, as the initializing thread may try to enter the config as a reader
- * before exiting.
+ * Wait for the initialize thread to be terminated (cancelled or stopped).
+ */
+static void
+vdev_initialize_stop_wait_impl(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for vdev initialize threads which were listed to cleanly exit.
*/
void
-vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
- ASSERTV(spa_t *spa = vd->vdev_spa);
- ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+ vdev_t *vd;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ while ((vd = list_remove_head(vd_list)) != NULL) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop_wait_impl(vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+}
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
+ * a list_t is provided the stopping vdev is inserted in to the list. Callers
+ * are then required to call vdev_initialize_stop_wait() to block for all the
+ * initialization threads to exit. The caller must hold vdev_initialize_lock
+ * and must not be writing to the spa config, as the initializing thread may
+ * try to enter the config as a reader before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
+{
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
ASSERT(vd->vdev_ops->vdev_op_leaf);
ASSERT(vdev_is_concrete(vd));
@@ -729,25 +762,29 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
vdev_initialize_change_state(vd, tgt_state);
vd->vdev_initialize_exit_wanted = B_TRUE;
- while (vd->vdev_initialize_thread != NULL)
- cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
- ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
- vd->vdev_initialize_exit_wanted = B_FALSE;
+ if (vd_list == NULL) {
+ vdev_initialize_stop_wait_impl(vd);
+ } else {
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ list_insert_tail(vd_list, vd);
+ }
}
static void
-vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
{
if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
mutex_enter(&vd->vdev_initialize_lock);
- vdev_initialize_stop(vd, tgt_state);
+ vdev_initialize_stop(vd, tgt_state, vd_list);
mutex_exit(&vd->vdev_initialize_lock);
return;
}
for (uint64_t i = 0; i < vd->vdev_children; i++) {
- vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
+ vd_list);
}
}
@@ -758,12 +795,23 @@ vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
- vdev_initialize_stop_all_impl(vd, tgt_state);
+ spa_t *spa = vd->vdev_spa;
+ list_t vd_list;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
+ vdev_initialize_stop_wait(spa, &vd_list);
if (vd->vdev_spa->spa_sync_on) {
/* Make sure that our state has been synced to disk */
txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
}
+
+ list_destroy(&vd_list);
}
void
@@ -808,9 +856,10 @@ vdev_initialize_restart(vdev_t *vd)
#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_initialize_restart);
EXPORT_SYMBOL(vdev_xlate);
-EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize_stop_wait);
/* CSTYLED */
module_param(zfs_initialize_value, ulong, 0644);
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index d0824aa84..1896e824f 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1899,7 +1899,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
/* Stop initializing */
- (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
*txg = spa_vdev_config_enter(spa);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 3c36502d8..0dfa01684 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3846,73 +3846,68 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
/*
* innvl: {
- * vdevs: {
- * guid 1, guid 2, ...
+ * "initialize_command" -> POOL_INITIALIZE_{CANCEL|DO|SUSPEND} (uint64)
+ * "initialize_vdevs": { -> guids to initialize (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
* },
- * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
* }
*
* outnvl: {
- * [func: EINVAL (if provided command type didn't make sense)],
- * [vdevs: {
- * guid1: errno, (see function body for possible errnos)
+ * "initialize_vdevs": { -> initialization errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
* ...
- * }]
+ * }
* }
*
+ * EINVAL is returned for unknown commands or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
*/
static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
- {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
{ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
};
static int
zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
- spa_t *spa;
- int error;
-
- error = spa_open(poolname, &spa, FTAG);
- if (error != 0)
- return (error);
-
uint64_t cmd_type;
if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
&cmd_type) != 0) {
- spa_close(spa, FTAG);
return (SET_ERROR(EINVAL));
}
+
if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
cmd_type == POOL_INITIALIZE_DO ||
cmd_type == POOL_INITIALIZE_SUSPEND)) {
- spa_close(spa, FTAG);
return (SET_ERROR(EINVAL));
}
nvlist_t *vdev_guids;
if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
&vdev_guids) != 0) {
- spa_close(spa, FTAG);
return (SET_ERROR(EINVAL));
}
- nvlist_t *vdev_errlist = fnvlist_alloc();
- int total_errors = 0;
-
for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
- uint64_t vdev_guid = fnvpair_value_uint64(pair);
-
- error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
- if (error != 0) {
- char guid_as_str[MAXNAMELEN];
-
- (void) snprintf(guid_as_str, sizeof (guid_as_str),
- "%llu", (unsigned long long)vdev_guid);
- fnvlist_add_int64(vdev_errlist, guid_as_str, error);
- total_errors++;
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
}
}
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
+ vdev_errlist);
+
if (fnvlist_size(vdev_errlist) > 0) {
fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
vdev_errlist);