author      Don Brady <[email protected]>           2018-09-05 19:33:36 -0600
committer   Brian Behlendorf <[email protected]>    2018-09-05 18:33:36 -0700
commit      cc99f275a28c43fe450a66a7544f73c4935f7361 (patch)
tree        f867e1d2cbb550a047c0f87986831252c41a2fd9 /module/zfs
parent      cfa37548ebc880580782b245f2d233ed540e7a01 (diff)
Pool allocation classes
Allocation classes add the ability to dedicate top-level vdevs in a
pool to serving specific block categories, such as DDT data, metadata,
and small file blocks. A pool can opt in to this feature by adding a
'special' or 'dedup' top-level vdev.
Reviewed-by: Pavel Zakharov <[email protected]>
Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Alek Pinchuk <[email protected]>
Reviewed-by: Håkan Johansson <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Reviewed-by: DHE <[email protected]>
Reviewed-by: Richard Elling <[email protected]>
Reviewed-by: Gregor Kopka <[email protected]>
Reviewed-by: Kash Pande <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #5182
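
The selection policy this patch adds in spa_preferred_class() (see the spa_misc.c hunk below) can be summarized with a small standalone sketch. The names here (pool_classes_t, preferred_class, the boolean block-type flags) are illustrative stand-ins rather than the committed interfaces; the two tunables mirror zfs_ddt_data_is_special and zfs_user_indirect_is_special, and the metadata-reserve check for small file blocks is sketched separately after the diffstat.

/* Compilable sketch of the allocation-class selection order. */
#include <stdbool.h>
#include <stdint.h>

typedef enum { CLASS_NORMAL, CLASS_LOG, CLASS_SPECIAL, CLASS_DEDUP } alloc_class_t;

/* Illustrative stand-in for "does this class have any vdevs" (mc_groups != 0). */
typedef struct {
	bool has_log, has_special, has_dedup;
} pool_classes_t;

static const bool ddt_data_is_special = true;		/* zfs_ddt_data_is_special */
static const bool user_indirect_is_special = true;	/* zfs_user_indirect_is_special */

alloc_class_t
preferred_class(const pool_classes_t *p, bool is_zil, bool is_ddt,
    bool is_metadata, bool is_file_data, int level, uint64_t size,
    uint64_t special_smallblk)
{
	/* ZIL blocks go to the log class when log vdevs exist. */
	if (is_zil)
		return (p->has_log ? CLASS_LOG : CLASS_NORMAL);

	/* DDT data prefers a dedup vdev, then special, then normal. */
	if (is_ddt) {
		if (p->has_dedup)
			return (CLASS_DEDUP);
		return (p->has_special && ddt_data_is_special ?
		    CLASS_SPECIAL : CLASS_NORMAL);
	}

	/* Indirect blocks of user data may land in special if allowed. */
	if (level > 0 && is_file_data)
		return (p->has_special && user_indirect_is_special ?
		    CLASS_SPECIAL : CLASS_NORMAL);

	/* All other metadata goes to special when that class exists. */
	if (is_metadata || level > 0)
		return (p->has_special ? CLASS_SPECIAL : CLASS_NORMAL);

	/*
	 * Small file blocks below the dataset's special_small_blocks
	 * threshold also qualify, subject to the metadata reserve.
	 */
	if (is_file_data && p->has_special && size < special_smallblk)
		return (CLASS_SPECIAL);

	return (CLASS_NORMAL);
}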
Diffstat (limited to 'module/zfs')
-rw-r--r--   module/zfs/dmu.c             2
-rw-r--r--   module/zfs/dmu_objset.c     20
-rw-r--r--   module/zfs/metaslab.c      150
-rw-r--r--   module/zfs/spa.c            67
-rw-r--r--   module/zfs/spa_misc.c      109
-rw-r--r--   module/zfs/vdev.c          211
-rw-r--r--   module/zfs/vdev_label.c     24
-rw-r--r--   module/zfs/vdev_removal.c   38
-rw-r--r--   module/zfs/zfs_debug.c       6
-rw-r--r--   module/zfs/zfs_ioctl.c       9
-rw-r--r--   module/zfs/zio.c            98
11 files changed, 601 insertions, 133 deletions
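
The small-file-block case above (special_smallblk in the previous sketch) is bounded by the new zfs_special_class_metadata_reserve_pct tunable added in spa_misc.c below: once the special class is more than (100 - pct) percent allocated, only metadata may continue to land there. A minimal sketch of that check; small_block_fits_special is an illustrative name, not a function in the patch.

/* Sketch of the special-class metadata reserve for small file blocks. */
#include <stdbool.h>
#include <stdint.h>

/* Mirrors zfs_special_class_metadata_reserve_pct (default 25). */
static const uint64_t metadata_reserve_pct = 25;

bool
small_block_fits_special(uint64_t class_alloc, uint64_t class_space)
{
	/* Keep the last metadata_reserve_pct of the class for metadata only. */
	uint64_t limit = (class_space * (100 - metadata_reserve_pct)) / 100;

	return (class_alloc < limit);
}

With the default of 25, a 100 GiB special class stops accepting small file blocks once roughly 75 GiB is allocated, while metadata keeps allocating until the class is actually full and spills to the normal class.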
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 88b574d4a..8779eb358 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2281,6 +2281,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); bzero(zp->zp_iv, ZIO_DATA_IV_LEN); bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); + zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? + os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 9adda320f..3c9a817f7 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -315,6 +315,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval) } static void +smallblk_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); + ASSERT(ISP2(newval)); + + os->os_zpl_special_smallblock = newval; +} + +static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -556,6 +570,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_DNODESIZE), dnodesize_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_SPECIAL_SMALL_BLOCKS), + smallblk_changed_cb, os); + } } if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index f4c01497f..ac361abb6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -20,8 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ #include <sys/zfs_context.h> @@ -300,7 +301,7 @@ metaslab_class_validate(metaslab_class_t *mc) return (0); } -void +static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { @@ -337,7 +338,8 @@ metaslab_class_get_dspace(metaslab_class_t *mc) void metaslab_class_histogram_verify(metaslab_class_t *mc) { - vdev_t *rvd = mc->mc_spa->spa_root_vdev; + spa_t *spa = mc->mc_spa; + vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; @@ -834,7 +836,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_sm == NULL) + /* skip if not active or not a member */ + if (msp->ms_sm == NULL || msp->ms_group != mg) continue; for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) @@ -967,12 +970,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg) if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; + if (msp->ms_group != mg) + continue; valid_ms++; fragmentation += msp->ms_fragmentation; } - if (valid_ms <= vd->vdev_ms_count / 2) + if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; @@ -1003,7 +1008,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups to select from. Otherwise, we always consider it eligible * for allocations. 
*/ - if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + if ((mc != spa_normal_class(spa) && + mc != spa_special_class(spa) && + mc != spa_dedup_class(spa)) || + mc->mc_groups <= 1) return (B_TRUE); /* @@ -1466,12 +1474,26 @@ metaslab_unload(metaslab_t *msp) msp->ms_max_size = 0; } +static void +metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta) +{ + vdev_space_update(vd, alloc_delta, defer_delta, space_delta); + + ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, + vdev_deflated_space(vd, space_delta)); +} + int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; - objset_t *mos = vd->vdev_spa->spa_meta_objset; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; @@ -1528,8 +1550,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, /* * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the its space - * map so that can verify frees. + * that has an allocated space map object then load the space map + * so that we can verify frees. */ if (metaslab_debug_load && ms->ms_sm != NULL) { mutex_enter(&ms->ms_lock); @@ -1551,16 +1573,19 @@ void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), - 0, -msp->ms_size); + metaslab_space_update(vd, mg->mg_class, + -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + space_map_close(msp->ms_sm); metaslab_unload(msp); + range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); @@ -2583,7 +2608,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); - vdev_space_update(vd, 0, 0, msp->ms_size); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); @@ -2605,7 +2630,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta -= range_tree_space(*defer_tree); } - vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); + metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, + defer_delta, 0); /* * If there's a metaslab_load() in progress, wait for it to complete @@ -2704,21 +2730,25 @@ metaslab_sync_reassess(metaslab_group_t *mg) spa_config_exit(spa, SCL_ALLOC, FTAG); } -static uint64_t -metaslab_distance(metaslab_t *msp, dva_t *dva) +/* + * When writing a ditto block (i.e. more than one DVA for a given BP) on + * the same vdev as an existing DVA of this BP, then try to allocate it + * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
+ */ +static boolean_t +metaslab_is_unique(metaslab_t *msp, dva_t *dva) { - uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; - uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_id; + uint64_t dva_ms_id; + + if (DVA_GET_ASIZE(dva) == 0) + return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) - return (1ULL << 63); + return (B_TRUE); - if (offset < start) - return ((start - offset) << ms_shift); - if (offset > start) - return ((offset - start) << ms_shift); - return (0); + dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; + + return (msp->ms_id != dva_ms_id); } /* @@ -2978,7 +3008,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, - dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, + dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; @@ -3012,13 +3042,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; - uint64_t target_distance = min_distance - + (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < target_distance) - break; + if (want_unique && + !metaslab_is_unique(msp, &dva[i])) + break; /* try another metaslab */ } if (i == d) break; @@ -3036,8 +3063,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, - int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, + int d, int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -3091,7 +3118,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - min_distance, asize, allocator, zal, search, + want_unique, asize, allocator, zal, search, &was_active); } @@ -3221,6 +3248,7 @@ next: * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize)); + mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -3230,14 +3258,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, - int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, + int d, int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d, allocator); + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, + dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -3265,14 +3293,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, } /* - * If we have to write a ditto block (i.e. more than one DVA for a given BP) - * on the same vdev as an existing DVA of this BP, then try to allocate it - * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the - * existing DVAs. - */ -int ditto_same_vdev_distance_shift = 3; - -/* * Allocate a block for the specified i/o. 
*/ int @@ -3288,6 +3308,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. + * This will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, @@ -3348,6 +3369,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); } else { + ASSERT(mc->mc_rotor != NULL); mg = mc->mc_rotor; } @@ -3412,25 +3434,17 @@ top: ASSERT(mg->mg_class == mc); - /* - * If we don't need to try hard, then require that the - * block be 1/8th of the device away from any other DVAs - * in this BP. If we are trying hard, allow any offset - * to be used (distance=0). - */ - uint64_t distance = 0; - if (!try_hard) { - distance = vd->vdev_asize >> - ditto_same_vdev_distance_shift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - } - uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + /* + * If we don't need to try hard, then require that the + * block be on an different metaslab from any other DVAs + * in this BP (unique=true). If we are trying hard, then + * allow any metaslab to be used (unique=false). + */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d, allocator); + !try_hard, dva, d, allocator); if (offset != -1ULL) { /* @@ -3830,7 +3844,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, if (reserved_slots < max) available_slots = max - reserved_slots; - if (slots <= available_slots || GANG_ALLOCATION(flags)) { + if (slots <= available_slots || GANG_ALLOCATION(flags) || + flags & METASLAB_MUST_RESERVE) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. @@ -4107,9 +4122,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) - if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + for (int d = 0; d < ndvas; d++) { + error = metaslab_claim_dva(spa, &dva[d], txg); + if (error != 0) break; + } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -4235,7 +4252,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) } #if defined(_KERNEL) -/* CSTYLED */ +/* BEGIN CSTYLED */ module_param(metaslab_aliquot, ulong, 0644); MODULE_PARM_DESC(metaslab_aliquot, "allocation granularity (a.k.a. stripe size)"); @@ -4284,8 +4301,9 @@ module_param(zfs_metaslab_switch_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_switch_threshold, "segment-based metaslab selection maximum buckets before switching"); -/* CSTYLED */ module_param(metaslab_force_ganging, ulong, 0644); MODULE_PARM_DESC(metaslab_force_ganging, "blocks larger than this size are forced to be gang blocks"); +/* END CSTYLED */ + #endif diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 39f329bea..c503b06bc 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -31,6 +31,7 @@ * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. 
*/ /* @@ -272,8 +273,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { - alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - size = metaslab_class_get_space(spa_normal_class(spa)); + alloc = metaslab_class_get_alloc(mc); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + + size = metaslab_class_get_space(mc); + size += metaslab_class_get_space(spa_special_class(spa)); + size += metaslab_class_get_space(spa_dedup_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); @@ -1173,6 +1180,8 @@ spa_activate(spa_t *spa, int mode) spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -1320,6 +1329,12 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); + spa->spa_special_class = NULL; + + metaslab_class_destroy(spa->spa_dedup_class); + spa->spa_dedup_class = NULL; + /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. @@ -4988,7 +5003,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, char *poolname; nvlist_t *nvl; - if (nvlist_lookup_string(props, "tname", &poolname) != 0) + if (props == NULL || + nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* @@ -5092,9 +5108,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_metaslab_set_size(rvd->vdev_child[c]); - vdev_expand(rvd->vdev_child[c], txg); + /* + * instantiate the metaslab groups (this will dirty the vdevs) + * we can no longer error exit past this point + */ + for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + vdev_metaslab_set_size(vd); + vdev_expand(vd, txg); } } @@ -6940,8 +6962,14 @@ spa_async_thread(void *arg) mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); + old_space += metaslab_class_get_space(spa_special_class(spa)); + old_space += metaslab_class_get_space(spa_dedup_class(spa)); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); + new_space += metaslab_class_get_space(spa_special_class(spa)); + new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -7630,6 +7658,9 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; + metaslab_class_t *normal = spa_normal_class(spa); + metaslab_class_t *special = spa_special_class(spa); + metaslab_class_t *dedup = spa_dedup_class(spa); vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; @@ -7723,9 +7754,13 @@ spa_sync(spa_t *spa, uint64_t txg) for (int c = 0; c < 
rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; + metaslab_class_t *mc; + + if (mg == NULL || !metaslab_group_initialized(mg)) + continue; - if (mg == NULL || mg->mg_class != spa_normal_class(spa) || - !metaslab_group_initialized(mg)) + mc = mg->mg_class; + if (mc != normal && mc != special && mc != dedup) continue; /* @@ -7743,12 +7778,18 @@ spa_sync(spa_t *spa, uint64_t txg) } slots_per_allocator += zfs_vdev_def_queue_depth; } - metaslab_class_t *mc = spa_normal_class(spa); + for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); - mc->mc_alloc_max_slots[i] = slots_per_allocator; - } - mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + ASSERT0(refcount_count(&normal->mc_alloc_slots[i])); + ASSERT0(refcount_count(&special->mc_alloc_slots[i])); + ASSERT0(refcount_count(&dedup->mc_alloc_slots[i])); + normal->mc_alloc_max_slots[i] = slots_per_allocator; + special->mc_alloc_max_slots[i] = slots_per_allocator; + dedup->mc_alloc_max_slots[i] = slots_per_allocator; + } + normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 44ceb42d4..2c500c010 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -25,6 +25,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. */ #include <sys/zfs_context.h> @@ -409,6 +410,19 @@ spa_load_note(spa_t *spa, const char *fmt, ...) } /* + * By default dedup and user data indirects land in the special class + */ +int zfs_ddt_data_is_special = B_TRUE; +int zfs_user_indirect_is_special = B_TRUE; + +/* + * The percentage of special class final space reserved for metadata only. + * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only + * let metadata into the class. 
+ */ +int zfs_special_class_metadata_reserve_pct = 25; + +/* * ========================================================================== * SPA config locking * ========================================================================== @@ -1159,6 +1173,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); @@ -1554,6 +1570,16 @@ zfs_strtonum(const char *str, char **nptr) return (val); } +void +spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We bump the feature refcount for each special vdev added to the pool + */ + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); + spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); +} + /* * ========================================================================== * Accessor functions @@ -1811,6 +1837,79 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +metaslab_class_t * +spa_special_class(spa_t *spa) +{ + return (spa->spa_special_class); +} + +metaslab_class_t * +spa_dedup_class(spa_t *spa) +{ + return (spa->spa_dedup_class); +} + +/* + * Locate an appropriate allocation class + */ +metaslab_class_t * +spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, + uint_t level, uint_t special_smallblk) +{ + if (DMU_OT_IS_ZIL(objtype)) { + if (spa->spa_log_class->mc_groups != 0) + return (spa_log_class(spa)); + else + return (spa_normal_class(spa)); + } + + boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; + + if (DMU_OT_IS_DDT(objtype)) { + if (spa->spa_dedup_class->mc_groups != 0) + return (spa_dedup_class(spa)); + else if (has_special_class && zfs_ddt_data_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* Indirect blocks for user data can land in special if allowed */ + if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { + if (has_special_class && zfs_user_indirect_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + if (DMU_OT_IS_METADATA(objtype) || level > 0) { + if (has_special_class) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* + * Allow small file blocks in special class in some cases (like + * for the dRAID vdev feature). But always leave a reserve of + * zfs_special_class_metadata_reserve_pct exclusively for metadata. 
+ */ + if (DMU_OT_IS_FILE(objtype) && + has_special_class && size < special_smallblk) { + metaslab_class_t *special = spa_special_class(spa); + uint64_t alloc = metaslab_class_get_alloc(special); + uint64_t space = metaslab_class_get_space(special); + uint64_t limit = + (space * (100 - zfs_special_class_metadata_reserve_pct)) + / 100; + + if (alloc < limit) + return (special); + } + + return (spa_normal_class(spa)); +} + void spa_evicting_os_register(spa_t *spa, objset_t *os) { @@ -2500,6 +2599,8 @@ EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); +EXPORT_SYMBOL(spa_special_class); +EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); @@ -2579,5 +2680,13 @@ MODULE_PARM_DESC(spa_asize_inflation, module_param(spa_slop_shift, int, 0644); MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool"); + +module_param(zfs_ddt_data_is_special, int, 0644); +MODULE_PARM_DESC(zfs_ddt_data_is_special, + "Place DDT data into the special class"); + +module_param(zfs_user_indirect_is_special, int, 0644); +MODULE_PARM_DESC(zfs_user_indirect_is_special, + "Place user data indirect blocks into the special class"); /* END CSTYLED */ #endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f5c259bd4..dfe444368 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <[email protected]> * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ #include <sys/zfs_context.h> @@ -205,6 +206,25 @@ vdev_getops(const char *type) } /* + * Derive the enumerated alloction bias from string input. + * String origin is either the per-vdev zap or zpool(1M). + */ +static vdev_alloc_bias_t +vdev_derive_alloc_bias(const char *bias) +{ + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + + if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) + alloc_bias = VDEV_BIAS_LOG; + else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + alloc_bias = VDEV_BIAS_SPECIAL; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + alloc_bias = VDEV_BIAS_DEDUP; + + return (alloc_bias); +} + +/* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. 
*/ @@ -528,6 +548,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vdev_indirect_config_t *vic; char *tmp = NULL; int rc; + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -614,11 +636,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + /* + * If creating a top-level vdev, check for allocation classes input + */ + if (top_level && alloctype == VDEV_ALLOC_ADD) { + char *bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + + /* spa_vdev_add() expects feature to be enabled */ + if (spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES)) { + return (SET_ERROR(ENOTSUP)); + } + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) + vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -687,7 +730,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && + if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); @@ -703,14 +746,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT0(vd->vdev_top_zap); } - if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { + if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); - vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd, - spa->spa_alloc_count); + /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && @@ -952,6 +993,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; + tvd->vdev_alloc_bias = svd->vdev_alloc_bias; + svd->vdev_alloc_bias = VDEV_BIAS_NONE; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1114,6 +1158,55 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +static void +vdev_metaslab_group_create(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + /* + * metaslab_group_create was delayed until allocation bias was available + */ + if (vd->vdev_mg == NULL) { + metaslab_class_t *mc; + + if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = VDEV_BIAS_LOG; + + ASSERT3U(vd->vdev_islog, ==, + (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + mc = spa_log_class(spa); + break; + case VDEV_BIAS_SPECIAL: + mc = spa_special_class(spa); + break; + case VDEV_BIAS_DEDUP: + mc = spa_dedup_class(spa); + break; + default: + mc = spa_normal_class(spa); + } + + vd->vdev_mg = metaslab_group_create(mc, vd, + spa->spa_alloc_count); + + /* + * The spa ashift values currently only reflect the + * general vdev classes. 
Class destination is late + * binding so ashift checking had to wait until now + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + } +} + int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { @@ -1124,6 +1217,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; + boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); @@ -1139,7 +1233,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - if (oldc != 0) { + if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1165,6 +1259,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } +#ifndef _KERNEL + /* + * To accomodate zdb_leak_init() fake indirect + * metaslabs, we allocate a metaslab group for + * indirect vdevs which normally don't have one. + */ + if (vd->vdev_mg == NULL) { + ASSERT0(vdev_is_concrete(vd)); + vdev_metaslab_group_create(vd); + } +#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1182,8 +1287,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ - if (oldc == 0 && !vd->vdev_removing) + if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -1673,9 +1779,13 @@ vdev_open(vdev_t *vd) /* * Track the min and max ashift values for normal data devices. + * + * DJB - TBD these should perhaps be tracked per allocation class + * (e.g. spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - !vd->vdev_islog && vd->vdev_aux == NULL) { + vd->vdev_alloc_bias == VDEV_BIAS_NONE && + vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) @@ -2561,6 +2671,30 @@ vdev_dtl_load(vdev_t *vd) return (error); } +static void +vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *string; + + ASSERT(alloc_bias != VDEV_BIAS_NONE); + + string = + (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; + + ASSERT(string != NULL); + VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(string) + 1, string, tx)); + + if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { + spa_activate_allocation_classes(spa, tx); + } +} + void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { @@ -2597,8 +2731,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) + vdev_zap_allocation_data(vd, tx); } } + for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } @@ -2802,9 +2939,26 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); /* + * On spa_load path, grab the allocation bias from our zap + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + char bias_str[64]; + + if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), + bias_str) == 0) { + ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); + vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + } + } + + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -2999,6 +3153,7 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg) metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } @@ -3673,7 +3828,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? + vd->vdev_mg->mg_fragmentation : 0; } } @@ -3878,19 +4034,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } +int64_t +vdev_deflated_space(vdev_t *vd, int64_t space) +{ + ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); + + return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); +} + /* - * Update the in-core space usage stats for this vdev, its metaslab class, - * and the root vdev. + * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { - int64_t dspace_delta = space_delta; + int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - metaslab_group_t *mg = vd->vdev_mg; - metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -3900,10 +4062,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. 
*/ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; @@ -3911,21 +4070,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (mc == spa_normal_class(spa)) { + /* every class but log contributes to root space stats */ + if (vd->vdev_mg != NULL && !vd->vdev_islog) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } - - if (mc != NULL) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, - alloc_delta, defer_delta, space_delta, dspace_delta); - } + /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* @@ -4354,7 +4507,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 29d7d651b..439ab7438 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ /* @@ -463,6 +465,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } + + /* zpool command expects alloc class data */ + if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { + const char *bias = NULL; + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + bias = VDEV_ALLOC_BIAS_LOG; + break; + case VDEV_BIAS_SPECIAL: + bias = VDEV_ALLOC_BIAS_SPECIAL; + break; + case VDEV_BIAS_DEDUP: + bias = VDEV_ALLOC_BIAS_DEDUP; + break; + default: + ASSERT3U(vd->vdev_alloc_bias, ==, + VDEV_BIAS_NONE); + } + fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + bias); + } } if (vd->vdev_dtl_sm != NULL) { diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index dcce93c70..9db6fe37b 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -944,8 +944,18 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, } ASSERT3U(size, <=, maxalloc); - int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal, 0); + /* + * An allocation class might not have any remaining vdevs or space + */ + metaslab_class_t *mc = mg->mg_class; + if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) + mc = spa_normal_class(spa); + int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, + zal, 0); + if (error == ENOSPC && mc != spa_normal_class(spa)) { + error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, + &dst, 0, NULL, txg, 0, zal, 0); + } if (error != 0) return (error); @@ -1853,15 +1863,31 @@ spa_vdev_remove_top_check(vdev_t *vd) if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); + /* available space in the pool's normal class */ + uint64_t available = dsl_dir_space_available( + spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); + + metaslab_class_t *mc = vd->vdev_mg->mg_class; + + /* + * When removing a vdev from an allocation class that has + * remaining vdevs, include available space from the class. + */ + if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { + uint64_t class_avail = metaslab_class_get_space(mc) - + metaslab_class_get_alloc(mc); + + /* add class space, adjusted for overhead */ + available += (class_avail * 94) / 100; + } + /* * There has to be enough free space to remove the * device and leave double the "slop" space (i.e. we * must leave at least 3% of the pool free, in addition to * the normal slop space). */ - if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, - NULL, 0, B_TRUE) < - vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { return (SET_ERROR(ENOSPC)); } diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c index e2aff28e1..ca79893c9 100644 --- a/module/zfs/zfs_debug.c +++ b/module/zfs/zfs_debug.c @@ -133,11 +133,15 @@ zfs_dbgmsg_fini(void) { if (zfs_dbgmsg_kstat) kstat_delete(zfs_dbgmsg_kstat); - + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL mutex_enter(&zfs_dbgmsgs_lock); zfs_dbgmsg_purge(0); mutex_exit(&zfs_dbgmsgs_lock); mutex_destroy(&zfs_dbgmsgs_lock); +#endif } void diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 633d738aa..fc0fbbf59 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4191,6 +4191,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; + case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + /* + * This property could require the allocation classes + * feature to be active for setting, however we allow + * it so that tests of settable properties succeed. + * The CLI will issue a warning in this case. + */ + break; + case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 654c81ef9..88bd7831e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2017, Intel Corporation. 
*/ #include <sys/sysmacros.h> @@ -825,6 +826,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + if (zio->io_metaslab_class == NULL) + zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) @@ -1315,9 +1318,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { - ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa)); - - ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(pio->io_metaslab_class != NULL); + ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); @@ -1644,8 +1646,9 @@ zio_write_compress(zio_t *zio) if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { - ASSERT(psize != 0); + VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@ -3266,7 +3269,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); - if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } @@ -3282,9 +3285,14 @@ zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; + metaslab_class_t *mc; + + /* locate an appropriate allocation class */ + mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, + zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || - !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); @@ -3306,17 +3314,15 @@ zio_dva_throttle(zio_t *zio) zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_metaslab_class = mc; avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - - nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + nio = zio_io_to_allocate(spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - return (nio); } -void +static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; @@ -3336,7 +3342,7 @@ static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); + metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; @@ -3360,10 +3366,50 @@ zio_dva_allocate(zio_t *zio) if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; + /* + * if not already chosen, locate an appropriate allocation class + */ + mc = zio->io_metaslab_class; + if (mc == NULL) { + mc = spa_preferred_class(spa, zio->io_size, + zio->io_prop.zp_type, zio->io_prop.zp_level, + zio->io_prop.zp_zpl_smallblk); + zio->io_metaslab_class = mc; + } + error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); + /* + * 
Fallback to normal class when an alloc class is full + */ + if (error == ENOSPC && mc != spa_normal_class(spa)) { + /* + * If throttling, transfer reservation over to normal class. + * The io_allocator slot can remain the same even though we + * are switching classes. + */ + if (mc->mc_alloc_throttle_enabled && + (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { + metaslab_class_throttle_unreserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; + + mc = spa_normal_class(spa); + VERIFY(metaslab_class_throttle_reserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio, + flags | METASLAB_MUST_RESERVE)); + } else { + mc = spa_normal_class(spa); + } + zio->io_metaslab_class = mc; + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio, zio->io_allocator); + } + if (error != 0) { zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, @@ -3431,6 +3477,15 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + + /* + * Block pointer fields are useful to metaslabs for stats and debugging. + * Fill in the obvious ones before calling into metaslab_alloc(). + */ + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_PSIZE(new_bp, size); + BP_SET_LEVEL(new_bp, 0); + /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part @@ -4144,13 +4199,15 @@ zio_ready(zio_t *zio) if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(zio->io_metaslab_class != NULL); + /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. 
*/ metaslab_class_throttle_unreserve( - spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio->io_allocator, zio); + zio->io_metaslab_class, zio->io_prop.zp_copies, + zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -4233,14 +4290,15 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); - metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio->io_allocator, pio); + metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, + pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that @@ -4259,7 +4317,6 @@ zio_done(zio_t *zio) */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; - ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa)); zio_link_t *zl = NULL; /* @@ -4278,7 +4335,8 @@ zio_done(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(zio->io_metaslab_class != NULL); + ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } @@ -4290,9 +4348,11 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); - VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + VERIFY(refcount_not_held( + &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], zio)); } |
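
The zio.c hunk above also adds an ENOSPC fallback in zio_dva_allocate(): when the preferred special or dedup class is exhausted, the allocation is retried against the normal class. Below is a self-contained sketch of just that control flow; class_t and class_alloc are illustrative stand-ins, and the throttle-reservation handoff (METASLAB_MUST_RESERVE) is elided.

/* Sketch of the fallback-to-normal-class behavior on ENOSPC. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint64_t free;	/* bytes remaining in this class */
} class_t;

static int
class_alloc(class_t *mc, uint64_t size)
{
	if (size > mc->free)
		return (ENOSPC);
	mc->free -= size;
	return (0);
}

static int
alloc_with_fallback(class_t *preferred, class_t *normal, uint64_t size)
{
	int error = class_alloc(preferred, size);

	/* Spill to the normal class only when the preferred one is full. */
	if (error == ENOSPC && preferred != normal)
		error = class_alloc(normal, size);
	return (error);
}

int
main(void)
{
	class_t special = { 4096 };
	class_t normal = { 1 << 20 };

	/* The first write fits in special; the second spills to normal. */
	printf("%d\n", alloc_with_fallback(&special, &normal, 4096));
	printf("%d\n", alloc_with_fallback(&special, &normal, 8192));
	return (0);
}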