diff options
Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/metaslab.c   | 169 |
-rw-r--r-- | module/zfs/range_tree.c |  30 |
2 files changed, 162 insertions, 37 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 02b913780..9a9a5e0cf 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -272,6 +272,12 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; +/* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. + */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -1165,17 +1171,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2) * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + rs = avl_last(t); + if (rs == NULL) + return (0); return (rs->rs_end - rs->rs_start); } +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. 
This algorithm approximates the largest available chunk in + the largest range in the unflushed_frees tree by taking the first + chunk. While this may be a poor estimate, it should only remain so + briefly and should eventually self-correct as frees are no longer + deferred. Similar logic applies to the ms_freed tree. See + metaslab_load() for more details. + * + * There are two primary sources of inaccuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. + */ + uint64_t rstart = rs->rs_start; + uint64_t rsize = rs->rs_end - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { @@ -1269,7 +1341,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * If we're running low on space, find a segment based on size, * rather than iterating based on offset. 
*/ - if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { @@ -1375,7 +1447,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1693,7 +1765,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* * This function is used for verification purposes. Regardless of @@ -1883,18 +1954,21 @@ metaslab_load_impl(metaslab_t *msp) * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, " - "loading_time %lld ms", + "loading_time %lld ms, ms_max_size %llu, " + "max size error %llu", spa_syncing_txg(spa), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, space_map_length(msp->ms_sm), @@ -1903,7 +1977,8 @@ metaslab_load_impl(metaslab_t *msp) range_tree_space(msp->ms_freed), range_tree_space(msp->ms_defer[0]), range_tree_space(msp->ms_defer[1]), - (longlong_t)((load_end - load_start) / 1000000)); + (longlong_t)((load_end - load_start) / 1000000), + msp->ms_max_size, 
msp->ms_max_size - max_size); } metaslab_verify_space(msp, spa_syncing_txg(spa)); @@ -1967,10 +2042,10 @@ metaslab_unload(metaslab_t *msp) range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; /* * We explicitly recalculate the metaslab's weight based on its space @@ -2527,13 +2602,19 @@ metaslab_segment_weight(metaslab_t *msp) * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - if (msp->ms_loaded) { + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); - } else { - ASSERT0(msp->ms_max_size); - } boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { @@ -2571,14 +2652,21 @@ metaslab_weight(metaslab_t *msp) metaslab_set_fragmentation(msp); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. + * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. 
Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. @@ -3595,7 +3683,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, + &msp->ms_unflushed_frees_by_size, + metaslab_rangesize_compare, 0); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -3992,7 +4082,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. 
*/ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } @@ -4010,7 +4100,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -4020,7 +4111,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -4100,8 +4191,8 @@ metaslab_active_mask_verify(metaslab_t *msp) /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -4174,8 +4265,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -4282,7 +4373,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * can accurately determine if the allocation attempt should * proceed. 
*/ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -4360,7 +4451,7 @@ next: */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); @@ -4392,7 +4483,7 @@ next: * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } @@ -4403,14 +4494,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -4592,7 +4683,7 @@ top: * allow any metaslab to be used (unique=false). 
*/ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* @@ -5615,6 +5706,10 @@ MODULE_PARM_DESC(metaslab_df_max_search, module_param(metaslab_df_use_largest_segment, int, 0644); MODULE_PARM_DESC(metaslab_df_use_largest_segment, "when looking in size tree, use largest segment instead of exact fit"); + +module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644); +MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec, + "how long to trust the cached max chunk size of a metaslab"); /* END CSTYLED */ #endif diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 5919236d9..0e1297214 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -525,6 +525,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) } /* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + range_seg_t rsearch; + rsearch.rs_start = start; + rsearch.rs_end = start + 1; + + avl_index_t where; + range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs->rs_end - start); + return (B_TRUE); + } + + rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); + if (rs == NULL || rs->rs_start > start + size) + return (B_FALSE); + + *ostart = rs->rs_start; + *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + return (B_TRUE); +} + +/* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. */ |