Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/metaslab.c | 508
-rw-r--r-- | module/zfs/spa.c | 37
-rw-r--r-- | module/zfs/spa_misc.c | 28
-rw-r--r-- | module/zfs/vdev.c | 4
-rw-r--r-- | module/zfs/vdev_queue.c | 11
-rw-r--r-- | module/zfs/vdev_removal.c | 2
-rw-r--r-- | module/zfs/zil.c | 2
-rw-r--r-- | module/zfs/zio.c | 88
8 files changed, 505 insertions, 175 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 879238e7d..c1e32884f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -223,6 +223,8 @@ static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); +static void metaslab_passivate(metaslab_t *msp, uint64_t weight); +static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; #endif @@ -243,7 +245,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - refcount_create_tracked(&mc->mc_alloc_slots); + mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (refcount_t), KM_SLEEP); + mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) + refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } @@ -257,7 +264,12 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - refcount_destroy(&mc->mc_alloc_slots); + for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) + refcount_destroy(&mc->mc_alloc_slots[i]); + kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * + sizeof (refcount_t)); + kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * + sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -449,6 +461,30 @@ metaslab_compare(const void *x1, const void *x2) const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; + int sort1 = 0; + int sort2 = 0; + if (m1->ms_allocator != -1 && m1->ms_primary) + sort1 = 1; + else if (m1->ms_allocator != -1 && !m1->ms_primary) + sort1 = 2; + if (m2->ms_allocator != -1 && m2->ms_primary) + sort2 = 1; + else if (m2->ms_allocator != -1 && !m2->ms_primary) + sort2 = 2; + + /* + * Sort inactive metaslabs first, then primaries, then secondaries. When + * selecting a metaslab to allocate from, an allocator first tries its + * primary, then secondary active metaslab. If it doesn't have active + * metaslabs, or can't allocate from them, it searches for an inactive + * metaslab to activate. If it can't find a suitable one, it will steal + * a primary or secondary metaslab from another allocator. 
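[Editor's note] For reference, the ordering the new metaslab_compare() hunk establishes can be condensed as the sketch below. It reuses the metaslab_t fields this change introduces (ms_allocator, ms_primary) plus the existing ms_weight/ms_start tie-breaks; it is an illustration of the resulting order, not the literal hunk.

static int
metaslab_compare_sketch(const metaslab_t *m1, const metaslab_t *m2)
{
    /* 0 = not activated, 1 = some allocator's primary, 2 = secondary */
    int s1 = (m1->ms_allocator == -1) ? 0 : (m1->ms_primary ? 1 : 2);
    int s2 = (m2->ms_allocator == -1) ? 0 : (m2->ms_primary ? 1 : 2);

    if (s1 != s2)
        return (s1 < s2 ? -1 : 1);    /* inactive < primary < secondary */
    if (m1->ms_weight != m2->ms_weight)
        return (m1->ms_weight > m2->ms_weight ? -1 : 1); /* heavier first */
    /* fall back to start offset so the AVL tree has a total order */
    return (m1->ms_start < m2->ms_start ? -1 :
        (m1->ms_start > m2->ms_start ? 1 : 0));
}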
+ */ + if (sort1 < sort2) + return (-1); + if (sort1 > sort2) + return (1); + int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); @@ -591,12 +627,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg) } metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; @@ -604,7 +644,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; - refcount_create_tracked(&mg->mg_alloc_queue_depth); + mg->mg_allocators = allocators; + + mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), + KM_SLEEP); + mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); @@ -626,8 +675,20 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); + kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); + kmem_free(mg->mg_secondaries, mg->mg_allocators * + sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); - refcount_destroy(&mg->mg_alloc_queue_depth); + + for (int i = 0; i < mg->mg_allocators; i++) { + refcount_destroy(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * + sizeof (refcount_t)); + kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * + sizeof (uint64_t)); + kmem_free(mg, sizeof (metaslab_group_t)); } @@ -706,6 +767,22 @@ metaslab_group_passivate(metaslab_group_t *mg) taskq_wait_outstanding(mg->mg_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_t *msp = mg->mg_primaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mg->mg_secondaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -846,6 +923,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) } static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&mg->mg_lock)); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + +static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* @@ -856,10 +944,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); - 
ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); + metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } @@ -907,7 +992,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize) + uint64_t psize, int allocator) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -936,7 +1021,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; - uint64_t qmax = mg->mg_max_alloc_queue_depth; + uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); @@ -948,7 +1033,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_no_free_space) return (B_FALSE); - qdepth = refcount_count(&mg->mg_alloc_queue_depth); + qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's @@ -967,9 +1052,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_max_alloc_queue_depth; + qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + qdepth = refcount_count( + &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that @@ -1389,6 +1475,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + ms->ms_allocator = -1; + ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. All others @@ -1485,6 +1573,7 @@ metaslab_fini(metaslab_t *msp) cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); + ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } @@ -1880,19 +1969,59 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) + return (0); + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 
+ mg->mg_primaries : mg->mg_secondaries); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&mg->mg_lock); + if (arr[allocator] != NULL) { + mutex_exit(&mg->mg_lock); + return (EEXIST); + } + + arr[allocator] = msp; + ASSERT3S(msp->ms_allocator, ==, -1); + msp->ms_allocator = allocator; + msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + mutex_exit(&mg->mg_lock); + + return (0); +} + +static int +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = 0; metaslab_load_wait(msp); if (!msp->ms_loaded) { - int error = metaslab_load(msp); - if (error) { + if ((error = metaslab_load(msp)) != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } } + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + /* + * The metaslab was activated for another allocator + * while we were waiting, we should reselect. + */ + return (EBUSY); + } + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, @@ -1905,6 +2034,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) } static void +metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_group_sort(mg, msp, weight); + return; + } + + mutex_enter(&mg->mg_lock); + ASSERT3P(msp->ms_group, ==, mg); + if (msp->ms_primary) { + ASSERT3U(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + mg->mg_primaries[msp->ms_allocator] = NULL; + } else { + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + mg->mg_secondaries[msp->ms_allocator] = NULL; + } + msp->ms_allocator = -1; + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + +static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE); @@ -1920,7 +2077,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; - metaslab_group_sort(msp->ms_group, msp, weight); + metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } @@ -2477,11 +2634,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + if (msp->ms_new) { + msp->ms_new = B_FALSE; + mutex_enter(&mg->mg_lock); + mg->mg_ms_ready++; + mutex_exit(&mg->mg_lock); + } /* * Calculate the new weights before unloading any metaslabs. * This will give us the most accurate weighting. 
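[Editor's note] On how the new return values above are consumed: EEXIST (the allocator's primary or secondary slot is already occupied) and EBUSY (the metaslab was activated by another thread while we slept in metaslab_load_wait()) both simply send the caller back to pick another metaslab. A condensed, illustrative caller loop; pick_candidate() is a hypothetical stand-in for the selection actually done in metaslab_group_alloc_normal():

    for (;;) {
        metaslab_t *msp = pick_candidate(mg, allocator);  /* hypothetical */
        if (msp == NULL)
            return (-1ULL);

        mutex_enter(&msp->ms_lock);
        if (metaslab_activate(msp, allocator, activation_weight) != 0) {
            /* EEXIST/EBUSY/load failure: drop the lock and reselect */
            mutex_exit(&msp->ms_lock);
            continue;
        }
        /* ... allocate from msp ... */
        mutex_exit(&msp->ms_lock);
        break;
    }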
*/ - metaslab_group_sort(mg, msp, metaslab_weight(msp)); + metaslab_group_sort(mg, msp, metaslab_weight(msp) | + (msp->ms_weight & METASLAB_ACTIVE_MASK)); /* * If the metaslab is loaded and we've not tried to load or allocate @@ -2494,6 +2658,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } if (!metaslab_debug_unload) metaslab_unload(msp); @@ -2588,7 +2756,8 @@ metaslab_alloc_trace_fini(void) */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, + int allocator) { metaslab_alloc_trace_t *mat; @@ -2622,6 +2791,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; + mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; @@ -2656,7 +2826,7 @@ metaslab_trace_fini(zio_alloc_list_t *zal) } #else -#define metaslab_trace_add(zal, mg, msp, psize, id, off) +#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) void metaslab_alloc_trace_init(void) @@ -2687,35 +2857,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_add(&mg->mg_alloc_queue_depth, tag); + (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + while (cur < max) { + if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + } } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); + (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; @@ -2724,7 +2915,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + 
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], + tag)); } #endif } @@ -2766,91 +2958,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) return (start); } +/* + * Find the metaslab with the highest weight that is less than what we've + * already tried. In the common case, this means that we will examine each + * metaslab at most once. Note that concurrent callers could reorder metaslabs + * by activation/passivation once we have dropped the mg_lock. If a metaslab is + * activated by another thread, and we fail to allocate from the metaslab we + * have selected, we may not try the newly-activated metaslab, and instead + * activate another metaslab. This is not optimal, but generally does not cause + * any problems (a possible exception being if every metaslab is completely full + * except for the the newly-activated metaslab which we fail to examine). + */ +static metaslab_t * +find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, + dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, + zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) +{ + avl_index_t idx; + avl_tree_t *t = &mg->mg_metaslab_tree; + metaslab_t *msp = avl_find(t, search, &idx); + if (msp == NULL) + msp = avl_nearest(t, idx, AVL_AFTER); + + for (; msp != NULL; msp = AVL_NEXT(t, msp)) { + int i; + if (!metaslab_should_allocate(msp, asize)) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL, allocator); + continue; + } + + /* + * If the selected metaslab is condensing, skip it. + */ + if (msp->ms_condensing) + continue; + + *was_active = msp->ms_allocator != -1; + /* + * If we're activating as primary, this is our first allocation + * from this disk, so we don't need to check how close we are. + * If the metaslab under consideration was already active, + * we're getting desperate enough to steal another allocator's + * metaslab, so we still don't care about distances. + */ + if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) + break; + + uint64_t target_distance = min_distance + + (space_map_allocated(msp->ms_sm) != 0 ? 0 : + min_distance >> 1); + + for (i = 0; i < d; i++) { + if (metaslab_distance(msp, &dva[i]) < target_distance) + break; + } + if (i == d) + break; + } + + if (msp != NULL) { + search->ms_weight = msp->ms_weight; + search->ms_start = msp->ms_start + 1; + search->ms_allocator = msp->ms_allocator; + search->ms_primary = msp->ms_primary; + } + return (msp); +} + +/* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight; - uint64_t target_distance; - int i; + boolean_t tertiary = B_FALSE; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) { - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + for (int i = 0; i < d; i++) { + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + tertiary = B_TRUE; break; } } + /* + * If we don't have enough metaslabs active to fill the entire array, we + * just use the 0th slot. 
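[Editor's note] The metaslab_group_increment_qdepth() hunk a little earlier ramps mg_cur_max_alloc_queue_depth toward mg_max_alloc_queue_depth with a lock-free bounded increment. The pattern in isolation, as a sketch built on the existing atomic_cas_64() primitive (the real function additionally bumps mc_alloc_max_slots on success):

static boolean_t
bounded_inc_sketch(volatile uint64_t *counter, uint64_t limit)
{
    uint64_t cur = *counter;

    while (cur < limit) {
        /* atomic_cas_64() returns the prior value; equality means we won */
        if (atomic_cas_64(counter, cur, cur + 1) == cur)
            return (B_TRUE);
        cur = *counter;        /* lost the race: re-read and retry */
    }
    return (B_FALSE);          /* already at the ceiling */
}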
+ */ + if (mg->mg_ms_ready < mg->mg_allocators * 2) { + tertiary = B_FALSE; + allocator = 0; + } + + ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; + /* + * At the end of the metaslab tree are the already-active metaslabs, + * first the primaries, then the secondaries. When we resume searching + * through the tree, we need to consider ms_allocator and ms_primary so + * we start in the location right after where we left off, and don't + * accidentally loop forever considering the same metaslabs. + */ + search->ms_allocator = -1; + search->ms_primary = B_TRUE; for (;;) { - boolean_t was_active; - avl_tree_t *t = &mg->mg_metaslab_tree; - avl_index_t idx; + boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); - /* - * Find the metaslab with the highest weight that is less - * than what we've already tried. In the common case, this - * means that we will examine each metaslab at most once. - * Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. - * If a metaslab is activated by another thread, and we fail - * to allocate from the metaslab we have selected, we may - * not try the newly-activated metaslab, and instead activate - * another metaslab. This is not optimal, but generally - * does not cause any problems (a possible exception being - * if every metaslab is completely full except for the - * the newly-activated metaslab which we fail to examine). - */ - msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); - continue; - } - - /* - * If the selected metaslab is condensing, skip it. - */ - if (msp->ms_condensing) - continue; - - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - if (activation_weight == METASLAB_WEIGHT_PRIMARY) - break; - - target_distance = min_distance + - (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < - target_distance) - break; - } - if (i == d) - break; + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + mg->mg_primaries[allocator] != NULL) { + msp = mg->mg_primaries[allocator]; + was_active = B_TRUE; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + mg->mg_secondaries[allocator] != NULL && !tertiary) { + msp = mg->mg_secondaries[allocator]; + was_active = B_TRUE; + } else { + msp = find_valid_metaslab(mg, activation_weight, dva, d, + min_distance, asize, allocator, zal, search, + &was_active); } + mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. 
It's possible that @@ -2864,18 +3111,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && - activation_weight == METASLAB_WEIGHT_PRIMARY) { - metaslab_passivate(msp, - msp->ms_weight & ~METASLAB_ACTIVE_MASK); + /* + * If the metaslab is freshly activated for an allocator that + * isn't the one we're allocating from, or if it's a primary and + * we're seeking a secondary (or vice versa), we go back and + * select a new metaslab. + */ + if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && + (msp->ms_allocator != -1) && + (msp->ms_allocator != allocator || ((activation_weight == + METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { + mutex_exit(&msp->ms_lock); + continue; + } + + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (metaslab_activate(msp, allocator, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } + msp->ms_selected_txg = txg; /* @@ -2888,7 +3149,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); + TRACE_TOO_SMALL, allocator); goto next; } @@ -2900,13 +3161,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING); + TRACE_CONDENSING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ @@ -2962,19 +3225,20 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d); + min_distance, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE); + TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate @@ -3009,7 +3273,7 @@ int ditto_same_vdev_distance_shift = 3; int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal) + zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; @@ -3021,7 +3285,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * For testing, make some blocks above a certain size be gang blocks. 
*/ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); return (SET_ERROR(ENOSPC)); } @@ -3116,12 +3381,12 @@ top: */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize); + psize, allocator); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE); + TRACE_NOT_ALLOCATABLE, allocator); goto next; } @@ -3136,7 +3401,7 @@ top: vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR); + TRACE_VDEV_ERROR, allocator); goto next; } @@ -3160,7 +3425,7 @@ top: ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d); + distance, dva, d, allocator); if (offset != -1ULL) { /* @@ -3244,7 +3509,7 @@ next: bzero(&dva[d], sizeof (dva_t)); - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } @@ -3545,18 +3810,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) * the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, - int flags) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); - if (reserved_slots < mc->mc_alloc_max_slots) - available_slots = mc->mc_alloc_max_slots - reserved_slots; + uint64_t reserved_slots = + refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags)) { /* @@ -3564,7 +3831,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, * them individually when an I/O completes. 
*/ for (int d = 0; d < slots; d++) { - reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + reserved_slots = + refcount_add(&mc->mc_alloc_slots[allocator], + zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; @@ -3575,12 +3844,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, } void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { - (void) refcount_remove(&mc->mc_alloc_slots, zio); + (void) refcount_remove(&mc->mc_alloc_slots[allocator], + zio); } mutex_exit(&mc->mc_lock); } @@ -3602,7 +3873,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) @@ -3707,7 +3978,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio) + zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -3730,12 +4001,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal); + txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, + allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -3746,7 +4018,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 8ab7c3428..537e19068 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7652,9 +7652,11 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * If there are any pending vdev state changes, convert them @@ -7715,7 +7717,7 @@ spa_sync(spa_t *spa, uint64_t txg) * The max queue depth will not change in the middle of syncing * out this txg. */ - uint64_t queue_depth_total = 0; + uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -7729,18 +7731,23 @@ spa_sync(spa_t *spa, uint64_t txg) * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). 
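[Editor's note] To make the reservation protocol above concrete: callers in this change reserve one slot per copy against the zio's allocator before issuing the allocation, and release those slots against the same allocator when the I/O completes or the allocation fails. A condensed usage sketch with the real function names from this diff; the surrounding control flow is simplified:

    metaslab_class_t *mc = spa_normal_class(spa);

    if (metaslab_class_throttle_reserve(mc, zio->io_prop.zp_copies,
        zio->io_allocator, zio, 0)) {
        /* ZIO_FLAG_IO_ALLOCATING is set; proceed to metaslab_alloc() */
    } else {
        /* throttled: the zio stays queued on spa_alloc_trees[allocator] */
    }

    /* ... later, from zio_ready()/zio_done() or the error path ... */
    metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies,
        zio->io_allocator, zio);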
*/ - ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; - queue_depth_total += mg->mg_max_alloc_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); - ASSERT0(refcount_count(&mc->mc_alloc_slots)); - mc->mc_alloc_max_slots = queue_depth_total; + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); + mc->mc_alloc_max_slots[i] = slots_per_allocator; + } mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - ASSERT3U(mc->mc_alloc_max_slots, <=, - max_queue_depth * rvd->vdev_children); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); @@ -7920,9 +7927,11 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * Update usable space statistics. diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f43a38ef1..44ceb42d4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. 
@@ -375,6 +375,8 @@ int spa_asize_inflation = 24; */ int spa_slop_shift = 5; uint64_t spa_min_slop = 128 * 1024 * 1024; +int spa_allocators = 4; + /*PRINTFLIKE2*/ void @@ -624,7 +626,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -660,8 +661,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - avl_create(&spa->spa_alloc_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } /* * Every pool starts with the default cachefile @@ -740,7 +749,15 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } - avl_destroy(&spa->spa_alloc_tree); + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); @@ -764,7 +781,6 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c35f73923..00e1fbfa2 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -709,7 +709,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd); + spa_log_class(spa) : spa_normal_class(spa), vd, + spa->spa_alloc_count); } if (vd->vdev_ops->vdev_op_leaf && @@ -1145,7 +1146,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { uint64_t object = 0; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 75a123ece..30a883f85 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -191,6 +191,15 @@ int zfs_vdev_queue_depth_pct = 1000; int zfs_vdev_queue_depth_pct = 300; #endif +/* + * When performing allocations for a given metaslab, we want to make sure that + * there are enough IOs to aggregate together to improve throughput. 
We want to + * ensure that there are at least 128k worth of IOs that can be aggregated, and + * we assume that the average allocation size is 4k, so we need the queue depth + * to be 32 per allocator to get good aggregation of sequential writes. + */ +int zfs_vdev_def_queue_depth = 32; + int vdev_queue_offset_compare(const void *x1, const void *x2) diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f2bdd6389..dcce93c70 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -945,7 +945,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, ASSERT3U(size, <=, maxalloc); int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal); + &dst, 0, NULL, txg, 0, zal, 0); if (error != 0) return (error); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index f7c793d40..8b7aeb5c3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 9a98d4fc0..565a78c89 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -45,6 +45,7 @@ #include <sys/trace_zio.h> #include <sys/abd.h> #include <sys/dsl_crypt.h> +#include <sys/cityhash.h> /* * ========================================================================== @@ -2611,7 +2612,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); /* * The logical zio has already placed a reservation for @@ -2622,12 +2624,12 @@ zio_write_gang_block(zio_t *pio) * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio, flags)); + pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio); + &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2641,7 +2643,7 @@ zio_write_gang_block(zio_t *pio) * stage. */ metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio); + gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; @@ -2705,7 +2707,7 @@ zio_write_gang_block(zio_t *pio) * slot for them here. 
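[Editor's note] The zfs_vdev_def_queue_depth comment above amounts to a small calculation, and the spa_sync() hunk turns the same constant into the per-allocator slot budget. A worked example; 128 KiB and 4 KiB are the assumptions stated in the comment, and the 10-vdev pool size is purely illustrative:

    /* 128 KiB of aggregatable writes at an assumed 4 KiB average allocation */
    int def_queue_depth = (128 * 1024) / (4 * 1024);    /* = 32 per allocator */

    /*
     * spa_sync() adds zfs_vdev_def_queue_depth once per top-level vdev, so a
     * pool with 10 top-level vdevs ends up with
     * mc_alloc_max_slots[i] = 10 * 32 = 320 slots for every allocator i.
     */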
*/ VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio, flags)); + zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } @@ -3223,13 +3225,13 @@ zio_ddt_free(zio_t *zio) */ static zio_t * -zio_io_to_allocate(spa_t *spa) +zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - zio = avl_first(&spa->spa_alloc_tree); + zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); @@ -3239,12 +3241,13 @@ zio_io_to_allocate(spa_t *spa) * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ + ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(spa_normal_class(spa), - zio->io_prop.zp_copies, zio, 0)) { + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_tree, zio); + avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); @@ -3268,13 +3271,23 @@ zio_dva_throttle(zio_t *zio) ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - mutex_enter(&spa->spa_alloc_lock); + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); - avl_add(&spa->spa_alloc_tree, zio); + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(zio->io_spa); - mutex_exit(&spa->spa_alloc_lock); + nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); if (nio == zio) return (ZIO_PIPELINE_CONTINUE); @@ -3295,13 +3308,13 @@ zio_dva_throttle(zio_t *zio) } void -zio_allocate_dispatch(spa_t *spa) +zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_lock); - zio = zio_io_to_allocate(spa); - mutex_exit(&spa->spa_alloc_lock); + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; @@ -3340,7 +3353,7 @@ zio_dva_allocate(zio_t *zio) error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio); + &zio->io_alloc_list, zio, zio->io_allocator); if (error != 0) { zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " @@ -3409,14 +3422,23 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. 
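[Editor's note] The bucketing described above, as a standalone sketch: cityhash4() is the hash this change pulls in via <sys/cityhash.h>, and the 20-bit shift groups each object into 2^20-block regions so logically adjacent writes land on the same allocator. The ZIL path below applies the same idea with only the objset ID, since that is all that is known at allocation time.

static int
zio_pick_allocator_sketch(const zbookmark_phys_t *zb, int alloc_count)
{
    return ((int)(cityhash4(zb->zb_objset, zb->zb_object,
        (uint64_t)zb->zb_level, zb->zb_blkid >> 20) % alloc_count));
}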
+ */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL); + txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, + cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % + spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, - &io_alloc_list, NULL); + &io_alloc_list, NULL, cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) *slog = FALSE; } @@ -4119,8 +4141,8 @@ zio_ready(zio_t *zio) */ metaslab_class_throttle_unreserve( spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio); - zio_allocate_dispatch(zio->io_spa); + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -4204,18 +4226,19 @@ zio_dva_throttle_done(zio_t *zio) ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio); + 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ - zio_allocate_dispatch(zio->io_spa); + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static int @@ -4227,6 +4250,7 @@ zio_done(zio_t *zio) */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; + ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa)); zio_link_t *zl = NULL; /* @@ -4245,8 +4269,7 @@ zio_done(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(spa_normal_class( - zio->io_spa)->mc_alloc_throttle_enabled); + ASSERT(mc->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } @@ -4258,9 +4281,10 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); - metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio); - VERIFY(refcount_not_held( - &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio)); + metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + zio)); } |
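[Editor's note] Taken together, the zio.c hunks replace the single spa_alloc_lock/spa_alloc_tree pair with one queue and one lock per allocator, so completing a throttled allocation only needs to poll its own allocator's queue. A condensed sketch of the resulting dispatch path using the functions from this diff; error handling and pipeline-stage bookkeeping are omitted:

static void
allocate_dispatch_sketch(spa_t *spa, int allocator)
{
    zio_t *zio;

    mutex_enter(&spa->spa_alloc_locks[allocator]);
    /* returns NULL if the queue is empty or no slot could be reserved */
    zio = zio_io_to_allocate(spa, allocator);
    mutex_exit(&spa->spa_alloc_locks[allocator]);
    if (zio == NULL)
        return;

    /* hand the winner back to the issue taskq, as zio_allocate_dispatch() does */
    zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}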