author     Joe Stein <[email protected]>           2016-04-11 16:16:57 -0400
committer  Brian Behlendorf <[email protected]>  2016-05-02 14:27:45 -0700
commit     e0ab3ab553e36595344d9cbdc240d380ad203b60 (patch)
tree       d9a568b7b7fce81fab94a7409b98d60b7a2c10f9 /module/zfs/spa.c
parent     4cd77889b684fd0dd1a0a995b692dda3db76a9ac (diff)
OpenZFS 6736 - ZFS per-vdev ZAPs
6736 ZFS per-vdev ZAPs
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: John Kennedy <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Don Brady <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
References:
  https://www.illumos.org/issues/6736
  https://github.com/openzfs/openzfs/commit/215198a
Ported-by: Don Brady <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #4515
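
The change gives every vdev its own ZAP object in the MOS and tracks all of them in a single "all-vdev ZAP" (AVZ) that hangs off the MOS directory under DMU_POOL_VDEV_ZAP_MAP; each vdev_t caches its object numbers in vdev_top_zap and vdev_leaf_zap. As a rough illustration only (not part of this commit, and the function name avz_dump_example is made up), the new AVZ could be walked with the existing zap_cursor interfaces like so:

static void
avz_dump_example(spa_t *spa)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	/* Pools created before this change have no AVZ yet. */
	if (spa->spa_all_vdev_zaps == 0)
		return;

	for (zap_cursor_init(&zc, spa->spa_meta_objset,
	    spa->spa_all_vdev_zaps);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* Each entry names one vdev's top-level or leaf ZAP object. */
		zfs_dbgmsg("per-vdev ZAP object %llu",
		    (u_longlong_t)za.za_first_integer);
	}
	zap_cursor_fini(&zc);
}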
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r--  module/zfs/spa.c  234
1 file changed, 227 insertions, 7 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 63425d3d0..e24ba6f47 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1674,6 +1674,21 @@ spa_check_removed(vdev_t *vd)
}
}
+static void
+spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
+{
+	uint64_t i;
+
+	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
+
+	vd->vdev_top_zap = mvd->vdev_top_zap;
+	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
+	}
+}
+
/*
* Validate the current config against the MOS config
*/
@@ -1778,16 +1793,25 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_reopen(rvd);
-		} else if (mtvd->vdev_islog) {
+		} else {
+			if (mtvd->vdev_islog) {
+				/*
+				 * Load the slog device's state from the MOS
+				 * config since it's possible that the label
+				 * does not contain the most up-to-date
+				 * information.
+				 */
+				vdev_load_log_state(tvd, mtvd);
+				vdev_reopen(tvd);
+			}
+
			/*
-			 * Load the slog device's state from the MOS config
-			 * since it's possible that the label does not
-			 * contain the most up-to-date information.
+			 * Per-vdev ZAP info is stored exclusively in the MOS.
			 */
-			vdev_load_log_state(tvd, mtvd);
-			vdev_reopen(tvd);
+			spa_config_valid_zaps(tvd, mtvd);
}
}
+
vdev_free(mrvd);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -2215,6 +2239,36 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
}
/*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t total = 0;
+	uint64_t i;
+
+	if (vd->vdev_top_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+	}
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		total += vdev_count_verify_zaps(vd->vdev_child[i]);
+	}
+
+	return (total);
+}
+
+/*
* Load an existing storage pool, using the pool's builtin spa_config as a
* source of configuration information.
*/
@@ -2234,6 +2288,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
int parse, i;
uint64_t obj;
boolean_t missing_feat_write = B_FALSE;
+	nvlist_t *mos_config;
/*
* If this is an untrusted config, access the pool in read-only mode.
@@ -2633,6 +2688,38 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
+	 * Load the per-vdev ZAP map. If we have an older pool, this will not
+	 * be present; in this case, defer its creation to a later time to
+	 * avoid dirtying the MOS this early / out of sync context. See
+	 * spa_sync_config_object.
+	 */
+
+	/* The sentinel is only available in the MOS config. */
+	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+	    &spa->spa_all_vdev_zaps);
+
+	if (error != ENOENT && error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	} else if (error == 0 && !nvlist_exists(mos_config,
+	    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+		/*
+		 * An older version of ZFS overwrote the sentinel value, so
+		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
+		 * destruction to later; see spa_sync_config_object.
+		 */
+		spa->spa_avz_action = AVZ_ACTION_DESTROY;
+		/*
+		 * We're assuming that no vdevs have had their ZAPs created
+		 * before this. Better be sure of it.
+		 */
+		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+	}
+	nvlist_free(mos_config);
+
+	/*
* If we're assembling the pool from the split-off vdevs of
* an existing pool, we don't want to attach the spares & cache
* devices.
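
The import-time logic above boils down to three outcomes of the DMU_POOL_VDEV_ZAP_MAP lookup: a hard error fails the load, ENOENT simply defers AVZ creation to spa_sync_config_object, and success combined with a missing ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS sentinel marks the existing ZAPs as orphaned. Purely as a sketch (avz_is_orphaned_example is an invented name, not in the patch), the orphan test can be restated as a predicate over the lookup result and the trusted MOS config:

/*
 * "AVZ present but sentinel absent" means an older ZFS rewrote the MOS
 * config after the ZAPs were created, so the ZAPs are orphaned and will
 * be destroyed later under AVZ_ACTION_DESTROY.
 */
static boolean_t
avz_is_orphaned_example(int dirprop_err, nvlist_t *mos_config)
{
	return (dirprop_err == 0 &&
	    !nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
}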
@@ -5170,6 +5257,16 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
vml[c]->vdev_top->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
vml[c]->vdev_top->vdev_ashift) == 0);
+
+		/* transfer per-vdev ZAPs */
+		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
+		    vml[c]->vdev_parent->vdev_top_zap));
}
if (error != 0) {
@@ -5211,11 +5308,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa->spa_config_txg) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
spa_generate_guid(NULL)) == 0);
+	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
/* add the new pool to the namespace */
newspa = spa_add(newname, config, altroot);
+	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
newspa->spa_config_txg = spa->spa_config_txg;
spa_set_log_state(newspa, SPA_LOG_CLEAR);
@@ -5273,9 +5372,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
if (error == 0)
spa_history_log_internal(spa, "detach", tx,
"vdev=%s", vml[c]->vdev_path);
+
vdev_free(vml[c]);
}
}
+	spa->spa_avz_action = AVZ_ACTION_REBUILD;
vdev_config_dirty(spa->spa_root_vdev);
spa->spa_config_splitting = NULL;
nvlist_free(nvl);
@@ -6109,16 +6210,120 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
sav->sav_sync = B_FALSE;
}
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t i;
+
+	if (vd->vdev_top_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_top_zap, tx));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_leaf_zap, tx));
+	}
+	for (i = 0; i < vd->vdev_children; i++) {
+		spa_avz_build(vd->vdev_child[i], avz, tx);
+	}
+}
+
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
nvlist_t *config;
-	if (list_is_empty(&spa->spa_config_dirty_list))
+	/*
+	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+	 * its config may not be dirty but we still need to build per-vdev ZAPs.
+	 * Similarly, if the pool is being assembled (e.g. after a split), we
+	 * need to rebuild the AVZ although the config may not be dirty.
+	 */
+	if (list_is_empty(&spa->spa_config_dirty_list) &&
+	    spa->spa_avz_action == AVZ_ACTION_NONE)
return;
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+	    spa->spa_all_vdev_zaps != 0);
+
+	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Make and build the new AVZ */
+		uint64_t new_avz = zap_create(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+		/* Diff old AVZ with new one */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t vdzap = za.za_first_integer;
+			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+			    vdzap) == ENOENT) {
+				/*
+				 * ZAP is listed in old AVZ but not in new one;
+				 * destroy it
+				 */
+				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+				    tx));
+			}
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy the old AVZ */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+
+		/* Replace the old AVZ in the dir obj with the new one */
+		VERIFY0(zap_update(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+		    sizeof (new_avz), 1, &new_avz, tx));
+
+		spa->spa_all_vdev_zaps = new_avz;
+	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Walk through the AVZ and destroy all listed ZAPs */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t zap = za.za_first_integer;
+			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy and unlink the AVZ itself */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+		VERIFY0(zap_remove(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+		spa->spa_all_vdev_zaps = 0;
+	}
+
+	if (spa->spa_all_vdev_zaps == 0) {
+		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_VDEV_ZAP_MAP, tx);
+	}
+	spa->spa_avz_action = AVZ_ACTION_NONE;
+
+	/* Create ZAPs for vdevs that don't have them. */
+	vdev_construct_zaps(spa->spa_root_vdev, tx);
+
config = spa_config_generate(spa, spa->spa_root_vdev,
dmu_tx_get_txg(tx), B_FALSE);
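
With spa_sync_config_object guaranteeing that every vdev gets a ZAP (via vdev_construct_zaps) and that the AVZ stays consistent, per-vdev state can be persisted with ordinary ZAP updates from sync context. The fragment below is only a sketch of a hypothetical consumer; it is not part of the patch and the attribute name is invented:

static void
vdev_store_example_attr(vdev_t *vd, uint64_t value, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;

	/* Leaf ZAPs are created during sync by vdev_construct_zaps(). */
	ASSERT3U(vd->vdev_leaf_zap, !=, 0);

	/* "org.example:attr" is a made-up per-vdev attribute name. */
	VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_leaf_zap,
	    "org.example:attr", sizeof (value), 1, &value, tx));
}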
@@ -6509,6 +6714,21 @@ spa_sync(spa_t *spa, uint64_t txg)
} while (dmu_objset_is_dirty(mos, txg));
+	if (!list_is_empty(&spa->spa_config_dirty_list)) {
+		/*
+		 * Make sure that the number of ZAPs for all the vdevs matches
+		 * the number of ZAPs in the per-vdev ZAP list. This only gets
+		 * called if the config is dirty; otherwise there may be
+		 * outstanding AVZ operations that weren't completed in
+		 * spa_sync_config_object.
+		 */
+		uint64_t all_vdev_zap_entry_count;
+		ASSERT0(zap_count(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+		    all_vdev_zap_entry_count);
+	}
+
/*
* Rewrite the vdev configuration (which includes the uberblock)
* to commit the transaction group.