author    Paul Dagnelie <[email protected]>    2019-08-05 14:34:27 -0700
committer Matthew Ahrens <[email protected]>    2019-08-05 14:34:27 -0700
commit    c81f1790e28a547769ce9d8e2d25391cf4317cc0 (patch)
tree      4bad0800ec2e06ff51bb73eaaeabe2c85ca18cbe /module
parent    99e755d653aec351c43a636b5734d1de3476d233 (diff)
Metaslab max_size should be persisted while unloaded
When we unload metaslabs today in ZFS, the cached max_size value is discarded. We instead use the histogram to determine whether or not we think we can satisfy an allocation from the metaslab. This can result in situations where, if we're doing I/Os of a size not aligned to a histogram bucket, a metaslab is loaded even though it cannot satisfy the allocation we think it can. For example, a metaslab with 16 entries in the 16k-32k bucket may have entirely 16kB entries. If we try to allocate a 24kB buffer, we will load that metaslab because we think it should be able to handle the allocation. Doing so is expensive in CPU time, disk reads, and average IO latency. This is exacerbated if the write being attempted is a sync write.

This change makes ZFS cache the max_size after the metaslab is unloaded. If we ever get a free (or a coalesced group of frees) larger than the max_size, we will update it. Otherwise, we leave it as is. When attempting to allocate, we use the max_size as a lower bound, and respect it unless we are in try_hard. However, we do age the max_size out at some point, since we expect the actual max_size to increase as we do more frees. A more sophisticated algorithm here might be helpful, but this works reasonably well.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
Closes #9055
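To make the bucket mismatch above concrete, here is a small standalone illustration (not part of the patch): a 16kB free segment and a 24kB allocation request fall into the same power-of-two histogram bucket, so a non-empty 16k-32k bucket count alone cannot prove the request is satisfiable. The highbit64() helper below is a stand-in for the ZFS one, implemented with a GCC/Clang builtin so the example is self-contained; the bucket mapping is a simplification of the histogram-derived weight check.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ZFS highbit64(): 1-based index of the highest set bit. */
static int
highbit64(uint64_t v)
{
        return (v == 0 ? 0 : 64 - __builtin_clzll(v));
}

int
main(void)
{
        uint64_t seg = 16 * 1024;       /* every free segment is exactly 16kB */
        uint64_t req = 24 * 1024;       /* the allocation we are trying to place */

        /*
         * Power-of-two bucket index, i.e. floor(log2(size)): both values
         * land in bucket 14, which covers [16k, 32k). A histogram that only
         * records "16 segments in bucket 14" therefore looks like it can
         * satisfy the 24kB request even though no single segment can.
         */
        printf("segment bucket %d, request bucket %d\n",
            highbit64(seg) - 1, highbit64(req) - 1);
        return (0);
}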
Diffstat (limited to 'module')
-rw-r--r--  module/zfs/metaslab.c    169
-rw-r--r--  module/zfs/range_tree.c   30
2 files changed, 162 insertions, 37 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 02b913780..9a9a5e0cf 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -272,6 +272,12 @@ uint64_t metaslab_trace_max_entries = 5000;
*/
int max_disabled_ms = 3;
+/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -1165,17 +1171,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
-metaslab_block_maxsize(metaslab_t *msp)
+metaslab_largest_allocatable(metaslab_t *msp)
{
avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
- if (t == NULL || (rs = avl_last(t)) == NULL)
- return (0ULL);
+ if (t == NULL)
+ return (0);
+ rs = avl_last(t);
+ if (rs == NULL)
+ return (0);
return (rs->rs_end - rs->rs_start);
}
+/*
+ * Return the maximum contiguous segment within the unflushed frees of this
+ * metaslab.
+ */
+uint64_t
+metaslab_largest_unflushed_free(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (msp->ms_unflushed_frees == NULL)
+ return (0);
+
+ range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
+ if (rs == NULL)
+ return (0);
+
+ /*
+ * When a range is freed from the metaslab, that range is added to
+ * both the unflushed frees and the deferred frees. While the block
+ * will eventually be usable, if the metaslab were loaded the range
+ * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
+ * txgs had passed. As a result, when attempting to estimate an upper
+ * bound for the largest currently-usable free segment in the
+ * metaslab, we need to not consider any ranges currently in the defer
+ * trees. This algorithm approximates the largest available chunk in
+ * the largest range in the unflushed_frees tree by taking the first
+ * chunk. While this may be a poor estimate, it should only remain so
+ * briefly and should eventually self-correct as frees are no longer
+ * deferred. Similar logic applies to the ms_freed tree. See
+ * metaslab_load() for more details.
+ *
+ * There are two primary sources of inaccuracy in this estimate. Both
+ * are tolerated for performance reasons. The first source is that we
+ * only check the largest segment for overlaps. Smaller segments may
+ * have more favorable overlaps with the other trees, resulting in
+ * larger usable chunks. Second, we only look at the first chunk in
+ * the largest segment; there may be other usable chunks in the
+ * largest segment, but we ignore them.
+ */
+ uint64_t rstart = rs->rs_start;
+ uint64_t rsize = rs->rs_end - rstart;
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
+ rsize, &start, &size);
+ if (found) {
+ if (rstart == start)
+ return (0);
+ rsize = start - rstart;
+ }
+ }
+
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
+ rsize, &start, &size);
+ if (found)
+ rsize = start - rstart;
+
+ return (rsize);
+}
+
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
@@ -1269,7 +1341,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
* If we're running low on space, find a segment based on size,
* rather than iterating based on offset.
*/
- if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold ||
+ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
offset = -1;
} else {
@@ -1375,7 +1447,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
- uint64_t max_size = metaslab_block_maxsize(msp);
+ uint64_t max_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==,
@@ -1693,7 +1765,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
msp->ms_weight = 0;
msp->ms_fragmentation = 0;
- msp->ms_max_size = 0;
/*
* This function is used for verification purposes. Regardless of
@@ -1883,18 +1954,21 @@ metaslab_load_impl(metaslab_t *msp)
* comment for ms_synchist and ms_deferhist[] for more info]
*/
uint64_t weight = msp->ms_weight;
+ uint64_t max_size = msp->ms_max_size;
metaslab_recalculate_weight_and_sort(msp);
if (!WEIGHT_IS_SPACEBASED(weight))
ASSERT3U(weight, <=, msp->ms_weight);
- msp->ms_max_size = metaslab_block_maxsize(msp);
-
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ ASSERT3U(max_size, <=, msp->ms_max_size);
hrtime_t load_end = gethrtime();
+ msp->ms_load_time = load_end;
if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
"ms_id %llu, smp_length %llu, "
"unflushed_allocs %llu, unflushed_frees %llu, "
"freed %llu, defer %llu + %llu, "
- "loading_time %lld ms",
+ "loading_time %lld ms, ms_max_size %llu, "
+ "max size error %llu",
spa_syncing_txg(spa), spa_name(spa),
msp->ms_group->mg_vd->vdev_id, msp->ms_id,
space_map_length(msp->ms_sm),
@@ -1903,7 +1977,8 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_space(msp->ms_freed),
range_tree_space(msp->ms_defer[0]),
range_tree_space(msp->ms_defer[1]),
- (longlong_t)((load_end - load_start) / 1000000));
+ (longlong_t)((load_end - load_start) / 1000000),
+ msp->ms_max_size, msp->ms_max_size - max_size);
}
metaslab_verify_space(msp, spa_syncing_txg(spa));
@@ -1967,10 +2042,10 @@ metaslab_unload(metaslab_t *msp)
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+ msp->ms_unload_time = gethrtime();
msp->ms_activation_weight = 0;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
- msp->ms_max_size = 0;
/*
* We explicitly recalculate the metaslab's weight based on its space
@@ -2527,13 +2602,19 @@ metaslab_segment_weight(metaslab_t *msp)
* weights we rely on the entire weight (excluding the weight-type bit).
*/
boolean_t
-metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
{
- if (msp->ms_loaded) {
+ /*
+ * If the metaslab is loaded, ms_max_size is definitive and we can use
+ * the fast check. If it's not, the ms_max_size is a lower bound (once
+ * set), and we should use the fast check as long as we're not in
+ * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
+ * seconds since the metaslab was unloaded.
+ */
+ if (msp->ms_loaded ||
+ (msp->ms_max_size != 0 && !try_hard && gethrtime() <
+ msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
return (msp->ms_max_size >= asize);
- } else {
- ASSERT0(msp->ms_max_size);
- }
boolean_t should_allocate;
if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
@@ -2571,14 +2652,21 @@ metaslab_weight(metaslab_t *msp)
metaslab_set_fragmentation(msp);
/*
- * Update the maximum size if the metaslab is loaded. This will
+ * Update the maximum size. If the metaslab is loaded, this will
* ensure that we get an accurate maximum size if newly freed space
- * has been added back into the free tree.
+ * has been added back into the free tree. If the metaslab is
+ * unloaded, we check if there's a larger free segment in the
+ * unflushed frees. This is a lower bound on the largest allocatable
+ * segment size. Coalescing of adjacent entries may reveal larger
+ * allocatable segments, but we aren't aware of those until loading
+ * the space map into a range tree.
*/
- if (msp->ms_loaded)
- msp->ms_max_size = metaslab_block_maxsize(msp);
- else
- ASSERT0(msp->ms_max_size);
+ if (msp->ms_loaded) {
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ } else {
+ msp->ms_max_size = MAX(msp->ms_max_size,
+ metaslab_largest_unflushed_free(msp));
+ }
/*
* Segment-based weighting requires space map histogram support.
@@ -3595,7 +3683,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
- msp->ms_unflushed_frees = range_tree_create(NULL, NULL);
+ msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
+ &msp->ms_unflushed_frees_by_size,
+ metaslab_rangesize_compare, 0);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
@@ -3992,7 +4082,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
* Now that we've attempted the allocation we need to update the
* metaslab's maximum block size since it may have changed.
*/
- msp->ms_max_size = metaslab_block_maxsize(msp);
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
return (start);
}
@@ -4010,7 +4100,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
- zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+ boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
+ boolean_t *was_active)
{
avl_index_t idx;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -4020,7 +4111,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
- if (!metaslab_should_allocate(msp, asize)) {
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
continue;
@@ -4100,8 +4191,8 @@ metaslab_active_mask_verify(metaslab_t *msp)
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -4174,8 +4265,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
- want_unique, asize, allocator, zal, search,
- &was_active);
+ want_unique, asize, allocator, try_hard, zal,
+ search, &was_active);
}
mutex_exit(&mg->mg_lock);
@@ -4282,7 +4373,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
* can accurately determine if the allocation attempt should
* proceed.
*/
- if (!metaslab_should_allocate(msp, asize)) {
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
@@ -4360,7 +4451,7 @@ next:
*/
uint64_t weight;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- weight = metaslab_block_maxsize(msp);
+ weight = metaslab_largest_allocatable(msp);
WEIGHT_SET_SPACEBASED(weight);
} else {
weight = metaslab_weight_from_range_tree(msp);
@@ -4392,7 +4483,7 @@ next:
* we may end up in an infinite loop retrying the same
* metaslab.
*/
- ASSERT(!metaslab_should_allocate(msp, asize));
+ ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
mutex_exit(&msp->ms_lock);
}
@@ -4403,14 +4494,14 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
- dva, d, allocator);
+ dva, d, allocator, try_hard);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -4592,7 +4683,7 @@ top:
* allow any metaslab to be used (unique=false).
*/
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- !try_hard, dva, d, allocator);
+ !try_hard, dva, d, allocator, try_hard);
if (offset != -1ULL) {
/*
@@ -5615,6 +5706,10 @@ MODULE_PARM_DESC(metaslab_df_max_search,
module_param(metaslab_df_use_largest_segment, int, 0644);
MODULE_PARM_DESC(metaslab_df_use_largest_segment,
"when looking in size tree, use largest segment instead of exact fit");
+
+module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
+MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
+ "how long to trust the cached max chunk size of a metaslab");
/* END CSTYLED */
#endif
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 5919236d9..0e1297214 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -525,6 +525,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
}
/*
+ * Returns the first subset of the given range which overlaps with the range
+ * tree. Returns true if there is a segment in the range, and false if there
+ * isn't.
+ */
+boolean_t
+range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
+ uint64_t *ostart, uint64_t *osize)
+{
+ range_seg_t rsearch;
+ rsearch.rs_start = start;
+ rsearch.rs_end = start + 1;
+
+ avl_index_t where;
+ range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where);
+ if (rs != NULL) {
+ *ostart = start;
+ *osize = MIN(size, rs->rs_end - start);
+ return (B_TRUE);
+ }
+
+ rs = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+ if (rs == NULL || rs->rs_start > start + size)
+ return (B_FALSE);
+
+ *ostart = rs->rs_start;
+ *osize = MIN(start + size, rs->rs_end) - rs->rs_start;
+ return (B_TRUE);
+}
+
+/*
* Ensure that this range is not in the tree, regardless of whether
* it is currently in the tree.
*/
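
For reference, a short worked example of the range_tree_find_in() behavior added above (illustrative values, not part of the patch): suppose the tree holds a single segment [100, 200).

range_tree_find_in(rt, 150, 100, &s, &z) returns B_TRUE with s = 150, z = 50: the query start lies inside the segment, and the overlap is clipped to the segment's end.
range_tree_find_in(rt, 50, 40, &s, &z) returns B_FALSE: the query range [50, 90) ends before the first segment begins.
range_tree_find_in(rt, 50, 100, &s, &z) returns B_TRUE with s = 100, z = 50: the first overlapping segment starts inside the query range, so the returned subset begins at the segment's start.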