diff options
author | Don Brady <[email protected]> | 2018-09-05 19:33:36 -0600 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-09-05 18:33:36 -0700 |
commit | cc99f275a28c43fe450a66a7544f73c4935f7361 (patch) | |
tree | f867e1d2cbb550a047c0f87986831252c41a2fd9 /module/zfs/vdev.c | |
parent | cfa37548ebc880580782b245f2d233ed540e7a01 (diff) |
Pool allocation classes
Allocation Classes add the ability to have allocation classes in a
pool that are dedicated to serving specific block categories, such
as DDT data, metadata, and small file blocks. A pool can opt-in to
this feature by adding a 'special' or 'dedup' top-level VDEV.
Reviewed by: Pavel Zakharov <[email protected]>
Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Alek Pinchuk <[email protected]>
Reviewed-by: HÃ¥kan Johansson <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Reviewed-by: DHE <[email protected]>
Reviewed-by: Richard Elling <[email protected]>
Reviewed-by: Gregor Kopka <[email protected]>
Reviewed-by: Kash Pande <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #5182
Diffstat (limited to 'module/zfs/vdev.c')
-rw-r--r-- | module/zfs/vdev.c | 211 |
1 files changed, 183 insertions, 28 deletions
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f5c259bd4..dfe444368 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <[email protected]> * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ #include <sys/zfs_context.h> @@ -205,6 +206,25 @@ vdev_getops(const char *type) } /* + * Derive the enumerated alloction bias from string input. + * String origin is either the per-vdev zap or zpool(1M). + */ +static vdev_alloc_bias_t +vdev_derive_alloc_bias(const char *bias) +{ + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + + if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) + alloc_bias = VDEV_BIAS_LOG; + else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + alloc_bias = VDEV_BIAS_SPECIAL; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + alloc_bias = VDEV_BIAS_DEDUP; + + return (alloc_bias); +} + +/* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ @@ -528,6 +548,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vdev_indirect_config_t *vic; char *tmp = NULL; int rc; + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -614,11 +636,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + /* + * If creating a top-level vdev, check for allocation classes input + */ + if (top_level && alloctype == VDEV_ALLOC_ADD) { + char *bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + + /* spa_vdev_add() expects feature to be enabled */ + if (spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES)) { + return (SET_ERROR(ENOTSUP)); + } + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) + vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -687,7 +730,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && + if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); @@ -703,14 +746,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT0(vd->vdev_top_zap); } - if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { + if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); - vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd, - spa->spa_alloc_count); + /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && @@ -952,6 +993,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; + tvd->vdev_alloc_bias = svd->vdev_alloc_bias; + svd->vdev_alloc_bias = VDEV_BIAS_NONE; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1114,6 +1158,55 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +static void +vdev_metaslab_group_create(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + /* + * metaslab_group_create was delayed until allocation bias was available + */ + if (vd->vdev_mg == NULL) { + metaslab_class_t *mc; + + if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = VDEV_BIAS_LOG; + + ASSERT3U(vd->vdev_islog, ==, + (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + mc = spa_log_class(spa); + break; + case VDEV_BIAS_SPECIAL: + mc = spa_special_class(spa); + break; + case VDEV_BIAS_DEDUP: + mc = spa_dedup_class(spa); + break; + default: + mc = spa_normal_class(spa); + } + + vd->vdev_mg = metaslab_group_create(mc, vd, + spa->spa_alloc_count); + + /* + * The spa ashift values currently only reflect the + * general vdev classes. Class destination is late + * binding so ashift checking had to wait until now + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + } +} + int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { @@ -1124,6 +1217,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; + boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); @@ -1139,7 +1233,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - if (oldc != 0) { + if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1165,6 +1259,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } +#ifndef _KERNEL + /* + * To accomodate zdb_leak_init() fake indirect + * metaslabs, we allocate a metaslab group for + * indirect vdevs which normally don't have one. + */ + if (vd->vdev_mg == NULL) { + ASSERT0(vdev_is_concrete(vd)); + vdev_metaslab_group_create(vd); + } +#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1182,8 +1287,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ - if (oldc == 0 && !vd->vdev_removing) + if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -1673,9 +1779,13 @@ vdev_open(vdev_t *vd) /* * Track the min and max ashift values for normal data devices. + * + * DJB - TBD these should perhaps be tracked per allocation class + * (e.g. spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - !vd->vdev_islog && vd->vdev_aux == NULL) { + vd->vdev_alloc_bias == VDEV_BIAS_NONE && + vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) @@ -2561,6 +2671,30 @@ vdev_dtl_load(vdev_t *vd) return (error); } +static void +vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *string; + + ASSERT(alloc_bias != VDEV_BIAS_NONE); + + string = + (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; + + ASSERT(string != NULL); + VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(string) + 1, string, tx)); + + if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { + spa_activate_allocation_classes(spa, tx); + } +} + void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { @@ -2597,8 +2731,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) + vdev_zap_allocation_data(vd, tx); } } + for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } @@ -2802,9 +2939,26 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); /* + * On spa_load path, grab the allocation bias from our zap + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + char bias_str[64]; + + if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), + bias_str) == 0) { + ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); + vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + } + } + + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -2999,6 +3153,7 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg) metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } @@ -3673,7 +3828,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? + vd->vdev_mg->mg_fragmentation : 0; } } @@ -3878,19 +4034,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } +int64_t +vdev_deflated_space(vdev_t *vd, int64_t space) +{ + ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); + + return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); +} + /* - * Update the in-core space usage stats for this vdev, its metaslab class, - * and the root vdev. + * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { - int64_t dspace_delta = space_delta; + int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - metaslab_group_t *mg = vd->vdev_mg; - metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -3900,10 +4062,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; @@ -3911,21 +4070,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (mc == spa_normal_class(spa)) { + /* every class but log contributes to root space stats */ + if (vd->vdev_mg != NULL && !vd->vdev_islog) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } - - if (mc != NULL) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, - alloc_delta, defer_delta, space_delta, dspace_delta); - } + /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* @@ -4354,7 +4507,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } |