author     Don Brady <[email protected]>          2018-09-05 19:33:36 -0600
committer  Brian Behlendorf <[email protected]>  2018-09-05 18:33:36 -0700
commit     cc99f275a28c43fe450a66a7544f73c4935f7361 (patch)
tree       f867e1d2cbb550a047c0f87986831252c41a2fd9 /module/zfs
parent     cfa37548ebc880580782b245f2d233ed540e7a01 (diff)
Pool allocation classes
Allocation classes add the ability to dedicate top-level vdevs in a pool to serving specific block categories, such as DDT data, metadata, and small file blocks. A pool can opt in to this feature by adding a 'special' or 'dedup' top-level VDEV.

Reviewed by: Pavel Zakharov <[email protected]>
Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Alek Pinchuk <[email protected]>
Reviewed-by: Håkan Johansson <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Reviewed-by: DHE <[email protected]>
Reviewed-by: Richard Elling <[email protected]>
Reviewed-by: Gregor Kopka <[email protected]>
Reviewed-by: Kash Pande <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #5182
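A minimal usage sketch of the user-facing workflow this commit enables (device names are placeholders and the exact syntax is documented in zpool(1M)/zfs(1M); block routing itself is decided in spa_preferred_class() below):

    # dedicate a mirrored pair to metadata and small blocks ('special' class)
    zpool create tank raidz sda sdb sdc special mirror nvme0 nvme1
    # dedicate a vdev to deduplication tables ('dedup' class)
    zpool add tank dedup mirror nvme2 nvme3
    # optionally route small file blocks (here <= 32K) into the special class
    zfs set special_small_blocks=32K tank/fs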
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/dmu.c             2
-rw-r--r--  module/zfs/dmu_objset.c     20
-rw-r--r--  module/zfs/metaslab.c      150
-rw-r--r--  module/zfs/spa.c            67
-rw-r--r--  module/zfs/spa_misc.c      109
-rw-r--r--  module/zfs/vdev.c          211
-rw-r--r--  module/zfs/vdev_label.c     24
-rw-r--r--  module/zfs/vdev_removal.c   38
-rw-r--r--  module/zfs/zfs_debug.c       6
-rw-r--r--  module/zfs/zfs_ioctl.c       9
-rw-r--r--  module/zfs/zio.c            98
11 files changed, 601 insertions, 133 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 88b574d4a..8779eb358 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2281,6 +2281,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 9adda320f..3c9a817f7 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -315,6 +315,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval)
}
static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -556,6 +570,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_DNODESIZE),
dnodesize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index f4c01497f..ac361abb6 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,8 +20,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -300,7 +301,7 @@ metaslab_class_validate(metaslab_class_t *mc)
return (0);
}
-void
+static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
@@ -337,7 +338,8 @@ metaslab_class_get_dspace(metaslab_class_t *mc)
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
- vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
uint64_t *mc_hist;
int i;
@@ -834,7 +836,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- if (msp->ms_sm == NULL)
+ /* skip if not active or not a member */
+ if (msp->ms_sm == NULL || msp->ms_group != mg)
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
@@ -967,12 +970,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
+ if (msp->ms_group != mg)
+ continue;
valid_ms++;
fragmentation += msp->ms_fragmentation;
}
- if (valid_ms <= vd->vdev_ms_count / 2)
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms;
@@ -1003,7 +1008,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
- if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
return (B_TRUE);
/*
@@ -1466,12 +1474,26 @@ metaslab_unload(metaslab_t *msp)
msp->ms_max_size = 0;
}
+static void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
metaslab_t *ms;
int error;
@@ -1528,8 +1550,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space map object then load the its space
- * map so that can verify frees.
+ * that has an allocated space map object then load the space map
+ * so that we can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
@@ -1551,16 +1573,19 @@ void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
- vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
- 0, -msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class,
+ -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+
space_map_close(msp->ms_sm);
metaslab_unload(msp);
+
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
@@ -2583,7 +2608,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
- vdev_space_update(vd, 0, 0, msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
@@ -2605,7 +2630,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta -= range_tree_space(*defer_tree);
}
- vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
/*
* If there's a metaslab_load() in progress, wait for it to complete
@@ -2704,21 +2730,25 @@ metaslab_sync_reassess(metaslab_group_t *mg)
spa_config_exit(spa, SCL_ALLOC, FTAG);
}
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
{
- uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
- uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_id;
+ uint64_t dva_ms_id;
+
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (1ULL << 63);
+ return (B_TRUE);
- if (offset < start)
- return ((start - offset) << ms_shift);
- if (offset > start)
- return ((offset - start) << ms_shift);
- return (0);
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
}
/*
@@ -2978,7 +3008,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
*/
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
- dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
{
avl_index_t idx;
@@ -3012,13 +3042,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
break;
- uint64_t target_distance = min_distance
- + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
- min_distance >> 1);
-
for (i = 0; i < d; i++) {
- if (metaslab_distance(msp, &dva[i]) < target_distance)
- break;
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
}
if (i == d)
break;
@@ -3036,8 +3063,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -3091,7 +3118,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
- min_distance, asize, allocator, zal, search,
+ want_unique, asize, allocator, zal, search,
&was_active);
}
@@ -3221,6 +3248,7 @@ next:
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
+
mutex_exit(&msp->ms_lock);
}
mutex_exit(&msp->ms_lock);
@@ -3230,14 +3258,14 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
- offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
- min_distance, dva, d, allocator);
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -3265,14 +3293,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
}
/*
- * If we have to write a ditto block (i.e. more than one DVA for a given BP)
- * on the same vdev as an existing DVA of this BP, then try to allocate it
- * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
- * existing DVAs.
- */
-int ditto_same_vdev_distance_shift = 3;
-
-/*
* Allocate a block for the specified i/o.
*/
int
@@ -3288,6 +3308,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
+ * This will also test spilling from special to normal.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
@@ -3348,6 +3369,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
} else {
+ ASSERT(mc->mc_rotor != NULL);
mg = mc->mc_rotor;
}
@@ -3412,25 +3434,17 @@ top:
ASSERT(mg->mg_class == mc);
- /*
- * If we don't need to try hard, then require that the
- * block be 1/8th of the device away from any other DVAs
- * in this BP. If we are trying hard, allow any offset
- * to be used (distance=0).
- */
- uint64_t distance = 0;
- if (!try_hard) {
- distance = vd->vdev_asize >>
- ditto_same_vdev_distance_shift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- }
-
uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+ /*
+ * If we don't need to try hard, then require that the
+ * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- distance, dva, d, allocator);
+ !try_hard, dva, d, allocator);
if (offset != -1ULL) {
/*
@@ -3830,7 +3844,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
if (reserved_slots < max)
available_slots = max - reserved_slots;
- if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
@@ -4107,9 +4122,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- for (int d = 0; d < ndvas; d++)
- if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
break;
+ }
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -4235,7 +4252,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
}
#if defined(_KERNEL)
-/* CSTYLED */
+/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
"allocation granularity (a.k.a. stripe size)");
@@ -4284,8 +4301,9 @@ module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
"segment-based metaslab selection maximum buckets before switching");
-/* CSTYLED */
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
"blocks larger than this size are forced to be gang blocks");
+/* END CSTYLED */
+
#endif
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 39f329bea..c503b06bc 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -31,6 +31,7 @@
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
@@ -272,8 +273,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (rvd != NULL) {
- alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- size = metaslab_class_get_space(spa_normal_class(spa));
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
@@ -1173,6 +1180,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
@@ -1320,6 +1329,12 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
@@ -4988,7 +5003,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
char *poolname;
nvlist_t *nvl;
- if (nvlist_lookup_string(props, "tname", &poolname) != 0)
+ if (props == NULL ||
+ nvlist_lookup_string(props, "tname", &poolname) != 0)
poolname = (char *)pool;
/*
@@ -5092,9 +5108,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_metaslab_set_size(rvd->vdev_child[c]);
- vdev_expand(rvd->vdev_child[c], txg);
+ /*
+ * instantiate the metaslab groups (this will dirty the vdevs)
+ * we can no longer error exit past this point
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
}
}
@@ -6940,8 +6962,14 @@ spa_async_thread(void *arg)
mutex_enter(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
mutex_exit(&spa_namespace_lock);
/*
@@ -7630,6 +7658,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
@@ -7723,9 +7754,13 @@ spa_sync(spa_t *spa, uint64_t txg)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
+ metaslab_class_t *mc;
+
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
- if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
- !metaslab_group_initialized(mg))
+ mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
continue;
/*
@@ -7743,12 +7778,18 @@ spa_sync(spa_t *spa, uint64_t txg)
}
slots_per_allocator += zfs_vdev_def_queue_depth;
}
- metaslab_class_t *mc = spa_normal_class(spa);
+
for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
- mc->mc_alloc_max_slots[i] = slots_per_allocator;
- }
- mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ ASSERT0(refcount_count(&normal->mc_alloc_slots[i]));
+ ASSERT0(refcount_count(&special->mc_alloc_slots[i]));
+ ASSERT0(refcount_count(&dedup->mc_alloc_slots[i]));
+ normal->mc_alloc_max_slots[i] = slots_per_allocator;
+ special->mc_alloc_max_slots[i] = slots_per_allocator;
+ dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 44ceb42d4..2c500c010 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -25,6 +25,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -409,6 +410,19 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
}
/*
+ * By default dedup and user data indirects land in the special class
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of special class final space reserved for metadata only.
+ * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
+ * let metadata into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
+/*
* ==========================================================================
* SPA config locking
* ==========================================================================
@@ -1159,6 +1173,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1554,6 +1570,16 @@ zfs_strtonum(const char *str, char **nptr)
return (val);
}
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We bump the feature refcount for each special vdev added to the pool
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
/*
* ==========================================================================
* Accessor functions
@@ -1811,6 +1837,79 @@ spa_log_class(spa_t *spa)
return (spa->spa_log_class);
}
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ if (DMU_OT_IS_ZIL(objtype)) {
+ if (spa->spa_log_class->mc_groups != 0)
+ return (spa_log_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+ * Allow small file blocks in special class in some cases (like
+ * for the dRAID vdev feature). But always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size < special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
+
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
@@ -2500,6 +2599,8 @@ EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
EXPORT_SYMBOL(spa_normal_class);
EXPORT_SYMBOL(spa_log_class);
+EXPORT_SYMBOL(spa_special_class);
+EXPORT_SYMBOL(spa_preferred_class);
EXPORT_SYMBOL(spa_max_replication);
EXPORT_SYMBOL(spa_prev_software_version);
EXPORT_SYMBOL(spa_get_failmode);
@@ -2579,5 +2680,13 @@ MODULE_PARM_DESC(spa_asize_inflation,
module_param(spa_slop_shift, int, 0644);
MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool");
+
+module_param(zfs_ddt_data_is_special, int, 0644);
+MODULE_PARM_DESC(zfs_ddt_data_is_special,
+ "Place DDT data into the special class");
+
+module_param(zfs_user_indirect_is_special, int, 0644);
+MODULE_PARM_DESC(zfs_user_indirect_is_special,
+ "Place user data indirect blocks into the special class");
/* END CSTYLED */
#endif
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f5c259bd4..dfe444368 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <[email protected]>
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -205,6 +206,25 @@ vdev_getops(const char *type)
}
/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(1M).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
+/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
*/
@@ -528,6 +548,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vdev_indirect_config_t *vic;
char *tmp = NULL;
int rc;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@@ -614,11 +636,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
ASSERT(nparity != -1ULL);
+ /*
+ * If creating a top-level vdev, check for allocation classes input
+ */
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+ }
+
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
@@ -687,7 +730,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent &&
+ if (top_level &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
@@ -703,14 +746,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
ASSERT0(vd->vdev_top_zap);
}
- if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
- vd->vdev_mg = metaslab_group_create(islog ?
- spa_log_class(spa) : spa_normal_class(spa), vd,
- spa->spa_alloc_count);
+ /* Note: metaslab_group_create() is now deferred */
}
if (vd->vdev_ops->vdev_op_leaf &&
@@ -952,6 +993,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@@ -1114,6 +1158,55 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
+static void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until allocation bias was available
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ /*
+ * The spa ashift values currently only reflect the
+ * general vdev classes. Class destination is late
+ * binding so ashift checking had to wait until now
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+ }
+}
+
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
@@ -1124,6 +1217,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
+ boolean_t expanding = (oldc != 0);
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
@@ -1139,7 +1233,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
- if (oldc != 0) {
+ if (expanding) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
@@ -1165,6 +1259,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}
+#ifndef _KERNEL
+ /*
+ * To accommodate zdb_leak_init() fake indirect
+ * metaslabs, we allocate a metaslab group for
+ * indirect vdevs which normally don't have one.
+ */
+ if (vd->vdev_mg == NULL) {
+ ASSERT0(vdev_is_concrete(vd));
+ vdev_metaslab_group_create(vd);
+ }
+#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
@@ -1182,8 +1287,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
- if (oldc == 0 && !vd->vdev_removing)
+ if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
+ }
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -1673,9 +1779,13 @@ vdev_open(vdev_t *vd)
/*
* Track the min and max ashift values for normal data devices.
+ *
+ * DJB - TBD these should perhaps be tracked per allocation class
+ * (e.g. spa_min_ashift is used to round up post compression buffers)
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
- !vd->vdev_islog && vd->vdev_aux == NULL) {
+ vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
+ vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
@@ -2561,6 +2671,30 @@ vdev_dtl_load(vdev_t *vd)
return (error);
}
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
@@ -2597,8 +2731,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
}
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
}
}
+
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
}
@@ -2802,9 +2939,26 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
/*
+ * On spa_load path, grab the allocation bias from our zap
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str) == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ }
+ }
+
+ /*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -2999,6 +3153,7 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg)
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
+
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
@@ -3673,7 +3828,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
}
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
- vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
}
}
@@ -3878,19 +4034,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
/*
- * Update the in-core space usage stats for this vdev, its metaslab class,
- * and the root vdev.
+ * Update the in-core space usage stats for this vdev and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
- int64_t dspace_delta = space_delta;
+ int64_t dspace_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -3900,10 +4062,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
- ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
- ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
- dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
@@ -3911,21 +4070,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (mc == spa_normal_class(spa)) {
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
-
- if (mc != NULL) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- metaslab_class_space_update(mc,
- alloc_delta, defer_delta, space_delta, dspace_delta);
- }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
}
/*
@@ -4354,7 +4507,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 29d7d651b..439ab7438 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
@@ -463,6 +465,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
}
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
}
if (vd->vdev_dtl_sm != NULL) {
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index dcce93c70..9db6fe37b 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -944,8 +944,18 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
}
ASSERT3U(size, <=, maxalloc);
- int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal, 0);
+ /*
+ * An allocation class might not have any remaining vdevs or space
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
if (error != 0)
return (error);
@@ -1853,15 +1863,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
- if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
- NULL, 0, B_TRUE) <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c
index e2aff28e1..ca79893c9 100644
--- a/module/zfs/zfs_debug.c
+++ b/module/zfs/zfs_debug.c
@@ -133,11 +133,15 @@ zfs_dbgmsg_fini(void)
{
if (zfs_dbgmsg_kstat)
kstat_delete(zfs_dbgmsg_kstat);
-
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
mutex_enter(&zfs_dbgmsgs_lock);
zfs_dbgmsg_purge(0);
mutex_exit(&zfs_dbgmsgs_lock);
mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
}
void
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 633d738aa..fc0fbbf59 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4191,6 +4191,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * This property could require the allocation classes
+ * feature to be active for setting, however we allow
+ * it so that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 654c81ef9..88bd7831e 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/sysmacros.h>
@@ -825,6 +826,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -1315,9 +1318,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
- ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
-
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
@@ -1644,8 +1646,9 @@ zio_write_compress(zio_t *zio)
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
- ASSERT(psize != 0);
+ VERIFY3U(psize, !=, 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -3266,7 +3269,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
* reserve then we throttle.
*/
ASSERT3U(zio->io_allocator, ==, allocator);
- if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
@@ -3282,9 +3285,14 @@ zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
- !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+ !mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (zio);
@@ -3306,17 +3314,15 @@ zio_dva_throttle(zio_t *zio)
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
-
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
-
- nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
-
return (nio);
}
-void
+static void
zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
@@ -3336,7 +3342,7 @@ static zio_t *
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa_normal_class(spa);
+ metaslab_class_t *mc;
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;
@@ -3360,10 +3366,50 @@ zio_dva_allocate(zio_t *zio)
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
flags |= METASLAB_ASYNC_ALLOC;
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
+ }
+
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
+ /*
+ * Fallback to normal class when an alloc class is full
+ */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ mc = spa_normal_class(spa);
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ } else {
+ mc = spa_normal_class(spa);
+ }
+ zio->io_metaslab_class = mc;
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
@@ -3431,6 +3477,15 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
/*
* When allocating a zil block, we don't have information about
* the final destination of the block except the objset it's part
@@ -4144,13 +4199,15 @@ zio_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
- spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -4233,14 +4290,15 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
- metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio->io_allocator, pio);
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
@@ -4259,7 +4317,6 @@ zio_done(zio_t *zio)
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
- ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
zio_link_t *zl = NULL;
/*
@@ -4278,7 +4335,8 @@ zio_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
@@ -4290,9 +4348,11 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
+
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
- VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ VERIFY(refcount_not_held(
+ &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
zio));
}