author     Joe Stein <[email protected]>          2016-04-11 16:16:57 -0400
committer  Brian Behlendorf <[email protected]>   2016-05-02 14:27:45 -0700
commit     e0ab3ab553e36595344d9cbdc240d380ad203b60 (patch)
tree       d9a568b7b7fce81fab94a7409b98d60b7a2c10f9 /module/zfs/spa.c
parent     4cd77889b684fd0dd1a0a995b692dda3db76a9ac (diff)
OpenZFS 6736 - ZFS per-vdev ZAPs
6736 ZFS per-vdev ZAPs
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: John Kennedy <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Don Brady <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/6736
https://github.com/openzfs/openzfs/commit/215198a
Ported-by: Don Brady <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #4515
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r--   module/zfs/spa.c   234
1 files changed, 227 insertions, 7 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 63425d3d0..e24ba6f47 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1674,6 +1674,21 @@ spa_check_removed(vdev_t *vd)
 	}
 }
 
+static void
+spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
+{
+	uint64_t i;
+
+	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
+
+	vd->vdev_top_zap = mvd->vdev_top_zap;
+	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
+	}
+}
+
 /*
  * Validate the current config against the MOS config
  */
@@ -1778,16 +1793,25 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
 
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			vdev_reopen(rvd);
-		} else if (mtvd->vdev_islog) {
+		} else {
+			if (mtvd->vdev_islog) {
+				/*
+				 * Load the slog device's state from the MOS
+				 * config since it's possible that the label
+				 * does not contain the most up-to-date
+				 * information.
+				 */
+				vdev_load_log_state(tvd, mtvd);
+				vdev_reopen(tvd);
+			}
+
 			/*
-			 * Load the slog device's state from the MOS config
-			 * since it's possible that the label does not
-			 * contain the most up-to-date information.
+			 * Per-vdev ZAP info is stored exclusively in the MOS.
 			 */
-			vdev_load_log_state(tvd, mtvd);
-			vdev_reopen(tvd);
+			spa_config_valid_zaps(tvd, mtvd);
 		}
 	}
+
 	vdev_free(mrvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
@@ -2215,6 +2239,36 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
 }
 
 /*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t total = 0;
+	uint64_t i;
+
+	if (vd->vdev_top_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+	}
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		total += vdev_count_verify_zaps(vd->vdev_child[i]);
+	}
+
+	return (total);
+}
+
+/*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
@@ -2234,6 +2288,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	int parse, i;
 	uint64_t obj;
 	boolean_t missing_feat_write = B_FALSE;
+	nvlist_t *mos_config;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -2633,6 +2688,38 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
+	 * Load the per-vdev ZAP map. If we have an older pool, this will not
+	 * be present; in this case, defer its creation to a later time to
+	 * avoid dirtying the MOS this early / out of sync context. See
+	 * spa_sync_config_object.
+	 */
+
+	/* The sentinel is only available in the MOS config. */
+	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+	    &spa->spa_all_vdev_zaps);
+
+	if (error != ENOENT && error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	} else if (error == 0 && !nvlist_exists(mos_config,
+	    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+		/*
+		 * An older version of ZFS overwrote the sentinel value, so
+		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
+		 * destruction to later; see spa_sync_config_object.
+		 */
+		spa->spa_avz_action = AVZ_ACTION_DESTROY;
+		/*
+		 * We're assuming that no vdevs have had their ZAPs created
+		 * before this. Better be sure of it.
+		 */
+		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+	}
+	nvlist_free(mos_config);
+
+	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
@@ -5170,6 +5257,16 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
+
+		/* transfer per-vdev ZAPs */
+		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
+		    vml[c]->vdev_parent->vdev_top_zap));
 	}
 
 	if (error != 0) {
@@ -5211,11 +5308,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
+	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
+	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
@@ -5273,9 +5372,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
+			vdev_free(vml[c]);
 		}
 	}
+	spa->spa_avz_action = AVZ_ACTION_REBUILD;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 
@@ -6109,16 +6210,120 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
 	sav->sav_sync = B_FALSE;
 }
 
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t i;
+
+	if (vd->vdev_top_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_top_zap, tx));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_leaf_zap, tx));
+	}
+	for (i = 0; i < vd->vdev_children; i++) {
+		spa_avz_build(vd->vdev_child[i], avz, tx);
+	}
+}
+
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
-	if (list_is_empty(&spa->spa_config_dirty_list))
+	/*
+	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+	 * its config may not be dirty but we still need to build per-vdev ZAPs.
+	 * Similarly, if the pool is being assembled (e.g. after a split), we
+	 * need to rebuild the AVZ although the config may not be dirty.
+	 */
+	if (list_is_empty(&spa->spa_config_dirty_list) &&
+	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
+	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+	    spa->spa_all_vdev_zaps != 0);
+
+	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Make and build the new AVZ */
+		uint64_t new_avz = zap_create(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+		/* Diff old AVZ with new one */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t vdzap = za.za_first_integer;
+			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+			    vdzap) == ENOENT) {
+				/*
+				 * ZAP is listed in old AVZ but not in new one;
+				 * destroy it
+				 */
+				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
				    tx));
+			}
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy the old AVZ */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+
+		/* Replace the old AVZ in the dir obj with the new one */
+		VERIFY0(zap_update(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+		    sizeof (new_avz), 1, &new_avz, tx));
+
+		spa->spa_all_vdev_zaps = new_avz;
+	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Walk through the AVZ and destroy all listed ZAPs */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t zap = za.za_first_integer;
+			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy and unlink the AVZ itself */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+		VERIFY0(zap_remove(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+		spa->spa_all_vdev_zaps = 0;
+	}
+
+	if (spa->spa_all_vdev_zaps == 0) {
+		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_VDEV_ZAP_MAP, tx);
+	}
+	spa->spa_avz_action = AVZ_ACTION_NONE;
+
+	/* Create ZAPs for vdevs that don't have them. */
+	vdev_construct_zaps(spa->spa_root_vdev, tx);
+
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
@@ -6509,6 +6714,21 @@ spa_sync(spa_t *spa, uint64_t txg)
 	} while (dmu_objset_is_dirty(mos, txg));
 
+	if (!list_is_empty(&spa->spa_config_dirty_list)) {
+		/*
+		 * Make sure that the number of ZAPs for all the vdevs matches
+		 * the number of ZAPs in the per-vdev ZAP list. This only gets
+		 * called if the config is dirty; otherwise there may be
+		 * outstanding AVZ operations that weren't completed in
+		 * spa_sync_config_object.
+		 */
+		uint64_t all_vdev_zap_entry_count;
+		ASSERT0(zap_count(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+		    all_vdev_zap_entry_count);
+	}
+
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
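The heart of this change is a bookkeeping invariant: each vdev may own a top-level and/or a leaf ZAP object, the MOS keeps a single all-vdev ZAP (AVZ) listing every such object, and spa_sync() asserts (when the config is dirty) that the count of ZAPs reachable from the vdev tree equals the number of AVZ entries. The following is a minimal standalone userspace sketch of that invariant, not ZFS code; the toy_vdev structure, avz_contains() helper, and object numbers are hypothetical and exist only for illustration.

/*
 * Userspace model (illustration only): vdevs optionally own ZAP object
 * numbers, the AVZ lists all of them, and a recursive walk verifies that
 * every per-vdev ZAP is listed and that the counts agree.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_vdev {
	uint64_t top_zap;		/* 0 means "no ZAP allocated" */
	uint64_t leaf_zap;
	size_t nchildren;
	struct toy_vdev **child;
};

/* Model of the AVZ: a flat list of ZAP object numbers. */
static const uint64_t avz[] = { 101, 102, 103 };

static int
avz_contains(uint64_t obj)
{
	for (size_t i = 0; i < sizeof (avz) / sizeof (avz[0]); i++) {
		if (avz[i] == obj)
			return (1);
	}
	return (0);
}

/* Mirrors the shape of vdev_count_verify_zaps(): count and check membership. */
static uint64_t
count_verify_zaps(const struct toy_vdev *vd)
{
	uint64_t total = 0;

	if (vd->top_zap != 0) {
		total++;
		assert(avz_contains(vd->top_zap));
	}
	if (vd->leaf_zap != 0) {
		total++;
		assert(avz_contains(vd->leaf_zap));
	}
	for (size_t i = 0; i < vd->nchildren; i++)
		total += count_verify_zaps(vd->child[i]);

	return (total);
}

int
main(void)
{
	/* root -> one top-level vdev (ZAP 101) -> two leaves (ZAPs 102, 103) */
	struct toy_vdev leaf0 = { 0, 102, 0, NULL };
	struct toy_vdev leaf1 = { 0, 103, 0, NULL };
	struct toy_vdev *leaves[] = { &leaf0, &leaf1 };
	struct toy_vdev top = { 101, 0, 2, leaves };
	struct toy_vdev *tops[] = { &top };
	struct toy_vdev root = { 0, 0, 1, tops };

	uint64_t n = count_verify_zaps(&root);
	/* the analogue of the spa_sync() assertion */
	assert(n == sizeof (avz) / sizeof (avz[0]));
	printf("per-vdev ZAP count matches AVZ entries: %llu\n",
	    (unsigned long long)n);
	return (0);
}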