aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/metaslab.c508
-rw-r--r--module/zfs/spa.c37
-rw-r--r--module/zfs/spa_misc.c28
-rw-r--r--module/zfs/vdev.c4
-rw-r--r--module/zfs/vdev_queue.c11
-rw-r--r--module/zfs/vdev_removal.c2
-rw-r--r--module/zfs/zil.c2
-rw-r--r--module/zfs/zio.c88
8 files changed, 505 insertions, 175 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 879238e7d..c1e32884f 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
@@ -223,6 +223,8 @@ static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
#endif
@@ -243,7 +245,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
- refcount_create_tracked(&mc->mc_alloc_slots);
+ mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (refcount_t), KM_SLEEP);
+ mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ refcount_create_tracked(&mc->mc_alloc_slots[i]);
return (mc);
}
@@ -257,7 +264,12 @@ metaslab_class_destroy(metaslab_class_t *mc)
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
- refcount_destroy(&mc->mc_alloc_slots);
+ for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
+ refcount_destroy(&mc->mc_alloc_slots[i]);
+ kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (refcount_t));
+ kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (uint64_t));
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@@ -449,6 +461,30 @@ metaslab_compare(const void *x1, const void *x2)
const metaslab_t *m1 = (const metaslab_t *)x1;
const metaslab_t *m2 = (const metaslab_t *)x2;
+ int sort1 = 0;
+ int sort2 = 0;
+ if (m1->ms_allocator != -1 && m1->ms_primary)
+ sort1 = 1;
+ else if (m1->ms_allocator != -1 && !m1->ms_primary)
+ sort1 = 2;
+ if (m2->ms_allocator != -1 && m2->ms_primary)
+ sort2 = 1;
+ else if (m2->ms_allocator != -1 && !m2->ms_primary)
+ sort2 = 2;
+
+ /*
+ * Sort inactive metaslabs first, then primaries, then secondaries. When
+ * selecting a metaslab to allocate from, an allocator first tries its
+ * primary, then secondary active metaslab. If it doesn't have active
+ * metaslabs, or can't allocate from them, it searches for an inactive
+ * metaslab to activate. If it can't find a suitable one, it will steal
+ * a primary or secondary metaslab from another allocator.
+ */
+ if (sort1 < sort2)
+ return (-1);
+ if (sort1 > sort2)
+ return (1);
+
int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
if (likely(cmp))
return (cmp);
@@ -591,12 +627,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
}
metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
metaslab_group_t *mg;
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
+ mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
mg->mg_vd = vd;
@@ -604,7 +644,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
- refcount_create_tracked(&mg->mg_alloc_queue_depth);
+ mg->mg_allocators = allocators;
+
+ mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
+ KM_SLEEP);
+ mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < allocators; i++) {
+ refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
@@ -626,8 +675,20 @@ metaslab_group_destroy(metaslab_group_t *mg)
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
+ kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
+ kmem_free(mg->mg_secondaries, mg->mg_allocators *
+ sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
- refcount_destroy(&mg->mg_alloc_queue_depth);
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ refcount_destroy(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
+ kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
+ sizeof (refcount_t));
+ kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
+ sizeof (uint64_t));
+
kmem_free(mg, sizeof (metaslab_group_t));
}
@@ -706,6 +767,22 @@ metaslab_group_passivate(metaslab_group_t *mg)
taskq_wait_outstanding(mg->mg_taskq, 0);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_t *msp = mg->mg_primaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ msp = mg->mg_secondaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ }
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@@ -846,6 +923,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
}
static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_lock));
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+
+}
+
+static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
/*
@@ -856,10 +944,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_weight = weight;
- avl_add(&mg->mg_metaslab_tree, msp);
+ metaslab_group_sort_impl(mg, msp, weight);
mutex_exit(&mg->mg_lock);
}
@@ -907,7 +992,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
- uint64_t psize)
+ uint64_t psize, int allocator)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
@@ -936,7 +1021,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
- uint64_t qmax = mg->mg_max_alloc_queue_depth;
+ uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
@@ -948,7 +1033,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_no_free_space)
return (B_FALSE);
- qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+ qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
/*
* If this metaslab group is below its qmax or it's
@@ -967,9 +1052,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
- qmax = mgp->mg_max_alloc_queue_depth;
+ qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
- qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+ qdepth = refcount_count(
+ &mgp->mg_alloc_queue_depth[allocator]);
/*
* If there is another metaslab group that
@@ -1389,6 +1475,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
/*
* We only open space map objects that already exist. All others
@@ -1485,6 +1573,7 @@ metaslab_fini(metaslab_t *msp)
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
kmem_free(msp, sizeof (metaslab_t));
}
@@ -1880,19 +1969,59 @@ metaslab_weight(metaslab_t *msp)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ int allocator, uint64_t activation_weight)
+{
+ /*
+ * If we're activating for the claim code, we don't want to actually
+ * set the metaslab up for a specific allocator.
+ */
+ if (activation_weight == METASLAB_WEIGHT_CLAIM)
+ return (0);
+ metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+ mg->mg_primaries : mg->mg_secondaries);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ mutex_enter(&mg->mg_lock);
+ if (arr[allocator] != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ arr[allocator] = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = 0;
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
- int error = metaslab_load(msp);
- if (error) {
+ if ((error = metaslab_load(msp)) != 0) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
}
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ /*
+ * The metaslab was activated for another allocator
+ * while we were waiting, we should reselect.
+ */
+ return (EBUSY);
+ }
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort(msp->ms_group, msp,
@@ -1905,6 +2034,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
}
static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ if (msp->ms_primary) {
+ ASSERT3U(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+ ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mg->mg_primaries[msp->ms_allocator] = NULL;
+ } else {
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+ mg->mg_secondaries[msp->ms_allocator] = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
@@ -1920,7 +2077,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
- metaslab_group_sort(msp->ms_group, msp, weight);
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
@@ -2477,11 +2634,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
*/
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ metaslab_group_sort(mg, msp, metaslab_weight(msp) |
+ (msp->ms_weight & METASLAB_ACTIVE_MASK));
/*
* If the metaslab is loaded and we've not tried to load or allocate
@@ -2494,6 +2658,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
VERIFY0(range_tree_space(
msp->ms_allocating[(txg + t) & TXG_MASK]));
}
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
if (!metaslab_debug_unload)
metaslab_unload(msp);
@@ -2588,7 +2756,8 @@ metaslab_alloc_trace_fini(void)
*/
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
- metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
{
metaslab_alloc_trace_t *mat;
@@ -2622,6 +2791,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
mat->mat_dva_id = dva_id;
mat->mat_offset = offset;
mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
if (msp != NULL)
mat->mat_weight = msp->ms_weight;
@@ -2656,7 +2826,7 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
}
#else
-#define metaslab_trace_add(zal, mg, msp, psize, id, off)
+#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
void
metaslab_alloc_trace_init(void)
@@ -2687,35 +2857,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
*/
static void
-metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
- flags & METASLAB_DONT_THROTTLE)
+ (flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
- (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+ (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
+}
+
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+ uint64_t max = mg->mg_max_alloc_queue_depth;
+ uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ while (cur < max) {
+ if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
+ cur, cur + 1) == cur) {
+ atomic_inc_64(
+ &mg->mg_class->mc_alloc_max_slots[allocator]);
+ return;
+ }
+ cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ }
}
void
-metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator, boolean_t io_complete)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
- flags & METASLAB_DONT_THROTTLE)
+ (flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
- (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+ (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
+ if (io_complete)
+ metaslab_group_increment_qdepth(mg, allocator);
}
void
-metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+ int allocator)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
@@ -2724,7 +2915,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
- VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+ VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
+ tag));
}
#endif
}
@@ -2766,91 +2958,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
return (start);
}
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely full
+ * except for the the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+ dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+ zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+ avl_index_t idx;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ metaslab_t *msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+ int i;
+ if (!metaslab_should_allocate(msp, asize)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ continue;
+ }
+
+ /*
+ * If the selected metaslab is condensing, skip it.
+ */
+ if (msp->ms_condensing)
+ continue;
+
+ *was_active = msp->ms_allocator != -1;
+ /*
+ * If we're activating as primary, this is our first allocation
+ * from this disk, so we don't need to check how close we are.
+ * If the metaslab under consideration was already active,
+ * we're getting desperate enough to steal another allocator's
+ * metaslab, so we still don't care about distances.
+ */
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+ break;
+
+ uint64_t target_distance = min_distance
+ + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+ min_distance >> 1);
+
+ for (i = 0; i < d; i++) {
+ if (metaslab_distance(msp, &dva[i]) < target_distance)
+ break;
+ }
+ if (i == d)
+ break;
+ }
+
+ if (msp != NULL) {
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
+ search->ms_allocator = msp->ms_allocator;
+ search->ms_primary = msp->ms_primary;
+ }
+ return (msp);
+}
+
+/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+ int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
uint64_t activation_weight;
- uint64_t target_distance;
- int i;
+ boolean_t tertiary = B_FALSE;
activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++) {
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ for (int i = 0; i < d; i++) {
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ tertiary = B_TRUE;
break;
}
}
+ /*
+ * If we don't have enough metaslabs active to fill the entire array, we
+ * just use the 0th slot.
+ */
+ if (mg->mg_ms_ready < mg->mg_allocators * 2) {
+ tertiary = B_FALSE;
+ allocator = 0;
+ }
+
+ ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
search->ms_weight = UINT64_MAX;
search->ms_start = 0;
+ /*
+ * At the end of the metaslab tree are the already-active metaslabs,
+ * first the primaries, then the secondaries. When we resume searching
+ * through the tree, we need to consider ms_allocator and ms_primary so
+ * we start in the location right after where we left off, and don't
+ * accidentally loop forever considering the same metaslabs.
+ */
+ search->ms_allocator = -1;
+ search->ms_primary = B_TRUE;
for (;;) {
- boolean_t was_active;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- avl_index_t idx;
+ boolean_t was_active = B_FALSE;
mutex_enter(&mg->mg_lock);
- /*
- * Find the metaslab with the highest weight that is less
- * than what we've already tried. In the common case, this
- * means that we will examine each metaslab at most once.
- * Note that concurrent callers could reorder metaslabs
- * by activation/passivation once we have dropped the mg_lock.
- * If a metaslab is activated by another thread, and we fail
- * to allocate from the metaslab we have selected, we may
- * not try the newly-activated metaslab, and instead activate
- * another metaslab. This is not optimal, but generally
- * does not cause any problems (a possible exception being
- * if every metaslab is completely full except for the
- * the newly-activated metaslab which we fail to examine).
- */
- msp = avl_find(t, search, &idx);
- if (msp == NULL)
- msp = avl_nearest(t, idx, AVL_AFTER);
- for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
-
- if (!metaslab_should_allocate(msp, asize)) {
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL);
- continue;
- }
-
- /*
- * If the selected metaslab is condensing, skip it.
- */
- if (msp->ms_condensing)
- continue;
-
- was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
- if (activation_weight == METASLAB_WEIGHT_PRIMARY)
- break;
-
- target_distance = min_distance +
- (space_map_allocated(msp->ms_sm) != 0 ? 0 :
- min_distance >> 1);
-
- for (i = 0; i < d; i++) {
- if (metaslab_distance(msp, &dva[i]) <
- target_distance)
- break;
- }
- if (i == d)
- break;
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ mg->mg_primaries[allocator] != NULL) {
+ msp = mg->mg_primaries[allocator];
+ was_active = B_TRUE;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ mg->mg_secondaries[allocator] != NULL && !tertiary) {
+ msp = mg->mg_secondaries[allocator];
+ was_active = B_TRUE;
+ } else {
+ msp = find_valid_metaslab(mg, activation_weight, dva, d,
+ min_distance, asize, allocator, zal, search,
+ &was_active);
}
+
mutex_exit(&mg->mg_lock);
if (msp == NULL) {
kmem_free(search, sizeof (*search));
return (-1ULL);
}
- search->ms_weight = msp->ms_weight;
- search->ms_start = msp->ms_start + 1;
mutex_enter(&msp->ms_lock);
-
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
@@ -2864,18 +3111,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
- if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
- activation_weight == METASLAB_WEIGHT_PRIMARY) {
- metaslab_passivate(msp,
- msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+ /*
+ * If the metaslab is freshly activated for an allocator that
+ * isn't the one we're allocating from, or if it's a primary and
+ * we're seeking a secondary (or vice versa), we go back and
+ * select a new metaslab.
+ */
+ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ (msp->ms_allocator != -1) &&
+ (msp->ms_allocator != allocator || ((activation_weight ==
+ METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_WEIGHT_CLAIM);
mutex_exit(&msp->ms_lock);
continue;
}
- if (metaslab_activate(msp, activation_weight) != 0) {
+ if (metaslab_activate(msp, allocator, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
+
msp->ms_selected_txg = txg;
/*
@@ -2888,7 +3149,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (!metaslab_should_allocate(msp, asize)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL);
+ TRACE_TOO_SMALL, allocator);
goto next;
}
@@ -2900,13 +3161,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_CONDENSING);
+ TRACE_CONDENSING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
- metaslab_trace_add(zal, mg, msp, asize, d, offset);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
@@ -2962,19 +3225,20 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+ int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
- min_distance, dva, d);
+ min_distance, dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
metaslab_trace_add(zal, mg, NULL, asize, d,
- TRACE_GROUP_FAILURE);
+ TRACE_GROUP_FAILURE, allocator);
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
@@ -3009,7 +3273,7 @@ int ditto_same_vdev_distance_shift = 3;
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
- zio_alloc_list_t *zal)
+ zio_alloc_list_t *zal, int allocator)
{
metaslab_group_t *mg, *fast_mg, *rotor;
vdev_t *vd;
@@ -3021,7 +3285,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* For testing, make some blocks above a certain size be gang blocks.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
- metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+ allocator);
return (SET_ERROR(ENOSPC));
}
@@ -3116,12 +3381,12 @@ top:
*/
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
- psize);
+ psize, allocator);
}
if (!allocatable) {
metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_NOT_ALLOCATABLE);
+ TRACE_NOT_ALLOCATABLE, allocator);
goto next;
}
@@ -3136,7 +3401,7 @@ top:
vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && !try_hard && vd->vdev_children == 0) {
metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_VDEV_ERROR);
+ TRACE_VDEV_ERROR, allocator);
goto next;
}
@@ -3160,7 +3425,7 @@ top:
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- distance, dva, d);
+ distance, dva, d, allocator);
if (offset != -1ULL) {
/*
@@ -3244,7 +3509,7 @@ next:
bzero(&dva[d], sizeof (dva_t));
- metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
return (SET_ERROR(ENOSPC));
}
@@ -3545,18 +3810,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
* the reservation.
*/
boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
- int flags)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+ zio_t *zio, int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
+ uint64_t max = mc->mc_alloc_max_slots[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
- if (reserved_slots < mc->mc_alloc_max_slots)
- available_slots = mc->mc_alloc_max_slots - reserved_slots;
+ uint64_t reserved_slots =
+ refcount_count(&mc->mc_alloc_slots[allocator]);
+ if (reserved_slots < max)
+ available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
@@ -3564,7 +3831,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
- reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+ reserved_slots =
+ refcount_add(&mc->mc_alloc_slots[allocator],
+ zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
@@ -3575,12 +3844,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
}
void
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+ int allocator, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
- (void) refcount_remove(&mc->mc_alloc_slots, zio);
+ (void) refcount_remove(&mc->mc_alloc_slots[allocator],
+ zio);
}
mutex_exit(&mc->mc_lock);
}
@@ -3602,7 +3873,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
if (error == 0 &&
!range_tree_contains(msp->ms_allocatable, offset, size))
@@ -3707,7 +3978,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
- zio_alloc_list_t *zal, zio_t *zio)
+ zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@@ -3730,12 +4001,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
- txg, flags, zal);
+ txg, flags, zal, allocator);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
metaslab_group_alloc_decrement(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags);
+ DVA_GET_VDEV(&dva[d]), zio, flags,
+ allocator, B_FALSE);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -3746,7 +4018,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags);
+ DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 8ab7c3428..537e19068 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7652,9 +7652,11 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
- mutex_enter(&spa->spa_alloc_lock);
- VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
- mutex_exit(&spa->spa_alloc_lock);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
/*
* If there are any pending vdev state changes, convert them
@@ -7715,7 +7717,7 @@ spa_sync(spa_t *spa, uint64_t txg)
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
- uint64_t queue_depth_total = 0;
+ uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
@@ -7729,18 +7731,23 @@ spa_sync(spa_t *spa, uint64_t txg)
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
- ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
mg->mg_max_alloc_queue_depth = max_queue_depth;
- queue_depth_total += mg->mg_max_alloc_queue_depth;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mg->mg_cur_max_alloc_queue_depth[i] =
+ zfs_vdev_def_queue_depth;
+ }
+ slots_per_allocator += zfs_vdev_def_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
- ASSERT0(refcount_count(&mc->mc_alloc_slots));
- mc->mc_alloc_max_slots = queue_depth_total;
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
+ mc->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
- ASSERT3U(mc->mc_alloc_max_slots, <=,
- max_queue_depth * rvd->vdev_children);
-
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@@ -7920,9 +7927,11 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
- mutex_enter(&spa->spa_alloc_lock);
- VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
- mutex_exit(&spa->spa_alloc_lock);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
/*
* Update usable space statistics.
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f43a38ef1..44ceb42d4 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -375,6 +375,8 @@ int spa_asize_inflation = 24;
*/
int spa_slop_shift = 5;
uint64_t spa_min_slop = 128 * 1024 * 1024;
+int spa_allocators = 4;
+
/*PRINTFLIKE2*/
void
@@ -624,7 +626,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -660,8 +661,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
- avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ spa->spa_alloc_count = spa_allocators;
+ spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (kmutex_t), KM_SLEEP);
+ spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (avl_tree_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ }
/*
* Every pool starts with the default cachefile
@@ -740,7 +749,15 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}
- avl_destroy(&spa->spa_alloc_tree);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ avl_destroy(&spa->spa_alloc_trees[i]);
+ mutex_destroy(&spa->spa_alloc_locks[i]);
+ }
+ kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
+ sizeof (kmutex_t));
+ kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
+ sizeof (avl_tree_t));
+
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
@@ -764,7 +781,6 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
- mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index c35f73923..00e1fbfa2 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -709,7 +709,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
- spa_log_class(spa) : spa_normal_class(spa), vd);
+ spa_log_class(spa) : spa_normal_class(spa), vd,
+ spa->spa_alloc_count);
}
if (vd->vdev_ops->vdev_op_leaf &&
@@ -1145,7 +1146,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
-
for (m = oldc; m < newc; m++) {
uint64_t object = 0;
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 75a123ece..30a883f85 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -191,6 +191,15 @@ int zfs_vdev_queue_depth_pct = 1000;
int zfs_vdev_queue_depth_pct = 300;
#endif
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
+
int
vdev_queue_offset_compare(const void *x1, const void *x2)
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index f2bdd6389..dcce93c70 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -945,7 +945,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
ASSERT3U(size, <=, maxalloc);
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal);
+ &dst, 0, NULL, txg, 0, zal, 0);
if (error != 0)
return (error);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index f7c793d40..8b7aeb5c3 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2018 Datto Inc.
*/
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 9a98d4fc0..565a78c89 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
@@ -45,6 +45,7 @@
#include <sys/trace_zio.h>
#include <sys/abd.h>
#include <sys/dsl_crypt.h>
+#include <sys/cityhash.h>
/*
* ==========================================================================
@@ -2611,7 +2612,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+ VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+ pio));
/*
* The logical zio has already placed a reservation for
@@ -2622,12 +2624,12 @@ zio_write_gang_block(zio_t *pio)
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
- pio, flags));
+ pio->io_allocator, pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio);
+ &pio->io_alloc_list, pio, pio->io_allocator);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2641,7 +2643,7 @@ zio_write_gang_block(zio_t *pio)
* stage.
*/
metaslab_class_throttle_unreserve(mc,
- gbh_copies - copies, pio);
+ gbh_copies - copies, pio->io_allocator, pio);
}
pio->io_error = error;
@@ -2705,7 +2707,7 @@ zio_write_gang_block(zio_t *pio)
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
- zp.zp_copies, cio, flags));
+ zp.zp_copies, cio->io_allocator, cio, flags));
}
zio_nowait(cio);
}
@@ -3223,13 +3225,13 @@ zio_ddt_free(zio_t *zio)
*/
static zio_t *
-zio_io_to_allocate(spa_t *spa)
+zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
- ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
- zio = avl_first(&spa->spa_alloc_tree);
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
if (zio == NULL)
return (NULL);
@@ -3239,12 +3241,13 @@ zio_io_to_allocate(spa_t *spa)
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
+ ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
- zio->io_prop.zp_copies, zio, 0)) {
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
- avl_remove(&spa->spa_alloc_tree, zio);
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
@@ -3268,13 +3271,23 @@ zio_dva_throttle(zio_t *zio)
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- mutex_enter(&spa->spa_alloc_lock);
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- avl_add(&spa->spa_alloc_tree, zio);
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
- nio = zio_io_to_allocate(zio->io_spa);
- mutex_exit(&spa->spa_alloc_lock);
+ nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
@@ -3295,13 +3308,13 @@ zio_dva_throttle(zio_t *zio)
}
void
-zio_allocate_dispatch(spa_t *spa)
+zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
- mutex_enter(&spa->spa_alloc_lock);
- zio = zio_io_to_allocate(spa);
- mutex_exit(&spa->spa_alloc_lock);
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
if (zio == NULL)
return;
@@ -3340,7 +3353,7 @@ zio_dva_allocate(zio_t *zio)
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio);
+ &zio->io_alloc_list, zio, zio->io_allocator);
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
@@ -3409,14 +3422,23 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
+ txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
+ cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+ spa->spa_alloc_count);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
- &io_alloc_list, NULL);
+ &io_alloc_list, NULL, cityhash4(0, 0, 0,
+ os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
if (error == 0)
*slog = FALSE;
}
@@ -4119,8 +4141,8 @@ zio_ready(zio_t *zio)
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio);
- zio_allocate_dispatch(zio->io_spa);
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -4204,18 +4226,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
- metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio);
+ 1, pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
- zio_allocate_dispatch(zio->io_spa);
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
}
static int
@@ -4227,6 +4250,7 @@ zio_done(zio_t *zio)
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
+ ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
zio_link_t *zl = NULL;
/*
@@ -4245,8 +4269,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(spa_normal_class(
- zio->io_spa)->mc_alloc_throttle_enabled);
+ ASSERT(mc->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
@@ -4258,9 +4281,10 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
- metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
- VERIFY(refcount_not_held(
- &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ zio));
}