diff options
author | Matthew Ahrens <[email protected]> | 2020-12-16 14:40:05 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2020-12-16 14:40:05 -0800 |
commit | be5c6d96530e19efde7c0af771f9ddb0073ef751 (patch) | |
tree | 86ec18586ac7464f1154ba881cc0d89ff9270b79 /module/zfs | |
parent | f8020c936356b887ab1e03eba1a723f9dfda6eea (diff) |
Only examine best metaslabs on each vdev
On a system with very high fragmentation, we may need to do lots of gang
allocations (e.g. most indirect block allocations (~50KB) may need to
gang). Before failing a "normal" allocation and resorting to ganging, we
try every metaslab. This has the impact of loading every metaslab (not
a huge deal since we now typically keep all metaslabs loaded), and also
iterating over every metaslab for every failing allocation. If there are
many metaslabs (more than the typical ~200, e.g. due to vdev expansion
or very large vdevs), the CPU cost of this iteration can be very
impactful. This iteration is done with the mg_lock held, creating long
hold times and high lock contention for concurrent allocations,
ultimately causing long txg sync times and poor application performance.
To address this, this commit changes the behavior of "normal" (not
try_hard, not ZIL) allocations. These will now only examine the 100
best metaslabs (as determined by their ms_weight). If none of these
have a large enough free segment, then the allocation will fail and
we'll fall back on ganging.
To accomplish this, we will now (normally) gang before doing a
`try_hard` allocation. Non-try_hard allocations will only examine the
100 best metaslabs of each vdev. In summary, we will first try normal
allocation. If that fails then we will do a gang allocation. If that
fails then we will do a "try hard" gang allocation. If that fails then
we will have a multi-layer gang block.
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #11327
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/metaslab.c | 100 | ||||
-rw-r--r-- | module/zfs/zio.c | 15 |
2 files changed, 60 insertions, 55 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ea1720a2c..bed6bf64c 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -264,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2; * Internal switch to enable/disable the metaslab allocation tracing * facility. */ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif +boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -276,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -#ifdef _METASLAB_TRACING uint64_t metaslab_trace_max_entries = 5000; -#endif /* * Maximum number of metaslabs per group that can be disabled @@ -314,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; */ uint32_t metaslab_by_size_min_shift = 14; +/* + * If not set, we will first try normal allocation. If that fails then + * we will do a gang allocation. If that fails then we will do a "try hard" + * gang allocation. If that fails then we will have a multi-layer gang + * block. + * + * If set, we will first try normal allocation. If that fails then + * we will do a "try hard" allocation. If that fails we will do a gang + * allocation. If that fails we will do a "try hard" gang allocation. If + * that fails then we will have a multi-layer gang block. + */ +int zfs_metaslab_try_hard_before_gang = B_FALSE; + +/* + * When not trying hard, we only consider the best zfs_metaslab_find_max_tries + * metaslabs. This improves performance, especially when there are many + * metaslabs per vdev and the allocation can't actually be satisfied (so we + * would otherwise iterate all the metaslabs). If there is a metaslab with a + * worse weight but it can actually satisfy the allocation, we won't find it + * until trying hard. This may happen if the worse metaslab is not loaded + * (and the true weight is better than we have calculated), or due to weight + * bucketization. E.g. we are looking for a 60K segment, and the best + * metaslabs all have free segments in the 32-63K bucket, but the best + * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a + * subsequent metaslab has ms_max_size >60KB (but fewer segments in this + * bucket, and therefore a lower weight). + */ +int zfs_metaslab_find_max_tries = 100; + static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -325,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); -#ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; - kstat_named_t metaslabstat_df_find_under_floor; kstat_named_t metaslabstat_reload_tree; + kstat_named_t metaslabstat_too_many_tries; + kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, - { "df_find_under_floor", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, + { "too_many_tries", KSTAT_DATA_UINT64 }, + { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ @@ -373,18 +399,6 @@ metaslab_stat_fini(void) kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } -#else - -void -metaslab_stat_init(void) -{ -} - -void -metaslab_stat_fini(void) -{ -} -#endif /* * ========================================================================== @@ -1355,9 +1369,7 @@ static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; -#ifdef _METASLAB_TRACING METASLABSTAT_BUMP(metaslabstat_reload_tree); -#endif ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; @@ -1667,13 +1679,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ -#ifdef _METASLAB_TRACING - metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; - if (size < (1 << mrap->mra_floor_shift)) { - METASLABSTAT_BUMP( - metaslabstat_df_find_under_floor); - } -#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } @@ -4404,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * Metaslab allocation tracing facility * ========================================================================== */ -#ifdef _METASLAB_TRACING /* * Add an allocation trace element to the allocation tracing list. @@ -4479,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) list_destroy(&zal->zal_list); zal->zal_size = 0; } -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ /* * ========================================================================== @@ -4634,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); + int tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; + + if (!try_hard && tries > zfs_metaslab_find_max_tries) { + METASLABSTAT_BUMP(metaslabstat_too_many_tries); + return (NULL); + } + tries++; + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5287,9 +5284,12 @@ next: } while ((mg = mg->mg_next) != rotor); /* - * If we haven't tried hard, do so now. + * If we haven't tried hard, perhaps do so now. */ - if (!try_hard) { + if (!try_hard && (zfs_metaslab_try_hard_before_gang || + GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || + psize <= 1 << spa->spa_min_ashift)) { + METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } @@ -6245,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, + ZMOD_RW, "Try hard to allocate before ganging"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, + "Normally only consider this many of the best metaslabs in each vdev"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ba438353a..3c2b731f7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3585,17 +3585,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, - cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count); + int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % + spa->spa_alloc_count; + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) { *slog = TRUE; } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, - &io_alloc_list, NULL, cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); + error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) *slog = FALSE; } |