Set aside a metaslab for ZIL blocks

Mixing ZIL and normal allocations has several problems: 1. The ZIL allocations are allocated, written to disk, and then a few seconds later freed. This leaves behind holes (free segments) where the ZIL blocks used to be, which increases fragmentation, which negatively impacts performance. 2. When under moderate load, ZIL allocations are of 128KB. If the pool is fairly fragmented, there may not be many free chunks of that size. This causes ZFS to load more metaslabs to locate free segments of 128KB or more. The loading happens synchronously (from zil_commit()), and can take around a second even if the metaslab's spacemap is cached in the ARC. All concurrent synchronous operations on this filesystem must wait while the metaslab is loading. This can cause a significant performance impact. 3. If the pool is very fragmented, there may be zero free chunks of 128KB or more. In this case, the ZIL falls back to txg_wait_synced(), which has an enormous performance impact. These problems can be eliminated by using a dedicated log device ("slog"), even one with the same performance characteristics as the normal devices. This change sets aside one metaslab from each top-level vdev that is preferentially used for ZIL allocations (vdev_log_mg, spa_embedded_log_class). From an allocation perspective, this is similar to having a dedicated log device, and it eliminates the above-mentioned performance problems. Log (ZIL) blocks can be allocated from the following locations. Each one is tried in order until the allocation succeeds: 1. dedicated log vdevs, aka "slog" (spa_log_class) 2. embedded slog metaslabs (spa_embedded_log_class) 3. other metaslabs in normal vdevs (spa_normal_class) The space required for the embedded slog metaslabs is usually between 0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop" space that is not available for user data. On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity, and recordsize=8k, testing shows a ~50% performance increase on random 8k sync writes. On even more fragmented systems (which hit problem #3 above and call txg_wait_synced()), the performance improvement can be arbitrarily large (>100x). Reviewed-by: Serapheim Dimitropoulos <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Don Brady <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #11389
author: Matthew Ahrens <[email protected]> 2021-01-21 15:12:54 -0800
committer: GitHub <[email protected]> 2021-01-21 15:12:54 -0800
commit: aa755b35493a2d27dbbc59f1527c6c93788b28f6 (patch)
tree: a0be60ea189e4b89772e57a7628b6a2dea4bdcce /module/zfs
parent: 984362a71ebef106d05a8f68f241c5c610ff6547 (diff)
6 files changed, 314 insertions, 99 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index bed6bf64c..fdc6922b0 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -524,7 +524,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
-		metaslab_group_t *mg = tvd->vdev_mg;
+		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 
 		/*
 		 * Skip any holes, uninitialized top-levels, or
@@ -535,12 +535,16 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 			continue;
 		}
 
+		IMPLY(mg == mg->mg_vd->vdev_log_mg,
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
 	}
 
-	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+	}
 
 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
@@ -1004,16 +1008,22 @@ metaslab_group_initialized(metaslab_group_t *mg)
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
-	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+	/*
+	 * Note that the number of nodes in mg_metaslab_tree may be one less
+	 * than vdev_ms_count, due to the embedded log metaslab.
+	 */
+	mutex_enter(&mg->mg_lock);
+	uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
+	mutex_exit(&mg->mg_lock);
+	return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
 }
 
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	uint64_t *mg_hist;
-	vdev_t *vd = mg->mg_vd;
-	uint64_t ashift = vd->vdev_ashift;
-	int i;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
@@ -1024,21 +1034,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
-	for (int m = 0; m < vd->vdev_ms_count; m++) {
-		metaslab_t *msp = vd->vdev_ms[m];
-
-		/* skip if not active or not a member */
-		if (msp->ms_sm == NULL || msp->ms_group != mg)
+	mutex_enter(&mg->mg_lock);
+	for (metaslab_t *msp = avl_first(t);
+	    msp != NULL; msp = AVL_NEXT(t, msp)) {
+		VERIFY3P(msp->ms_group, ==, mg);
+		/* skip if not active */
+		if (msp->ms_sm == NULL)
 			continue;
 
-		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 			mg_hist[i + ashift] +=
 			    msp->ms_sm->sm_phys->smp_histogram[i];
+		}
 	}
 
-	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
+	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 
+	mutex_exit(&mg->mg_lock);
+
 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
@@ -1054,6 +1068,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 
 	mutex_enter(&mg->mg_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		IMPLY(mg == mg->mg_vd->vdev_log_mg,
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 		mc->mc_histogram[i + ashift] +=
@@ -1078,6 +1094,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
+		IMPLY(mg == mg->mg_vd->vdev_log_mg,
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		mg->mg_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
@@ -2741,37 +2759,47 @@ metaslab_fini(metaslab_t *msp)
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
-	metaslab_space_update(vd, mg->mg_class,
-	    -metaslab_allocated_space(msp), 0, -msp->ms_size);
+	/*
+	 * If the range trees haven't been allocated, this metaslab hasn't
+	 * been through metaslab_sync_done() for the first time yet, so its
+	 * space hasn't been accounted for in its vdev and doesn't need to be
+	 * subtracted.
+	 */
+	if (msp->ms_freed != NULL) {
+		metaslab_space_update(vd, mg->mg_class,
+		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
+	}
 	space_map_close(msp->ms_sm);
 	msp->ms_sm = NULL;
 
 	metaslab_unload(msp);
+
 	range_tree_destroy(msp->ms_allocatable);
-	range_tree_destroy(msp->ms_freeing);
-	range_tree_destroy(msp->ms_freed);
 
-	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
-	    metaslab_unflushed_changes_memused(msp));
-	spa->spa_unflushed_stats.sus_memused -=
-	    metaslab_unflushed_changes_memused(msp);
-	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
-	range_tree_destroy(msp->ms_unflushed_allocs);
-	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
-	range_tree_destroy(msp->ms_unflushed_frees);
+	if (msp->ms_freed != NULL) {
+		range_tree_destroy(msp->ms_freeing);
+		range_tree_destroy(msp->ms_freed);
 
-	for (int t = 0; t < TXG_SIZE; t++) {
-		range_tree_destroy(msp->ms_allocating[t]);
-	}
+		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+		    metaslab_unflushed_changes_memused(msp));
+		spa->spa_unflushed_stats.sus_memused -=
+		    metaslab_unflushed_changes_memused(msp);
+		range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+		range_tree_destroy(msp->ms_unflushed_allocs);
+		range_tree_destroy(msp->ms_checkpointing);
+		range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+		range_tree_destroy(msp->ms_unflushed_frees);
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		range_tree_destroy(msp->ms_defer[t]);
+		for (int t = 0; t < TXG_SIZE; t++) {
+			range_tree_destroy(msp->ms_allocating[t]);
+		}
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_destroy(msp->ms_defer[t]);
+		}
 	}
 	ASSERT0(msp->ms_deferspace);
 
-	range_tree_destroy(msp->ms_checkpointing);
-
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
@@ -5113,7 +5141,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 		 * all else fails.
 		 */
 		if (vd != NULL && vd->vdev_mg != NULL) {
-			mg = vd->vdev_mg;
+			mg = vdev_get_mg(vd, mc);
 
 			if (flags & METASLAB_HINTBP_AVOID &&
 			    mg->mg_next != NULL)
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 53ffbc31c..57a492993 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -303,10 +303,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 		alloc = metaslab_class_get_alloc(mc);
 		alloc += metaslab_class_get_alloc(spa_special_class(spa));
 		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
 
 		size = metaslab_class_get_space(mc);
 		size += metaslab_class_get_space(spa_special_class(spa));
 		size += metaslab_class_get_space(spa_dedup_class(spa));
+		size += metaslab_class_get_space(spa_embedded_log_class(spa));
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1196,6 +1198,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
 
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+	spa->spa_embedded_log_class =
+	    metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
@@ -1347,6 +1351,9 @@ spa_deactivate(spa_t *spa)
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
+	metaslab_class_destroy(spa->spa_embedded_log_class);
+	spa->spa_embedded_log_class = NULL;
+
 	metaslab_class_destroy(spa->spa_special_class);
 	spa->spa_special_class = NULL;
 
@@ -2103,6 +2110,9 @@ spa_check_logs(spa_t *spa)
 	return (rv);
 }
 
+/*
+ * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
@@ -2113,10 +2123,10 @@ spa_passivate_log(spa_t *spa)
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
-		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (tvd->vdev_islog) {
-			metaslab_group_passivate(mg);
+			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+			metaslab_group_passivate(tvd->vdev_mg);
 			slog_found = B_TRUE;
 		}
 	}
@@ -2124,6 +2134,9 @@ spa_passivate_log(spa_t *spa)
 	return (slog_found);
 }
 
+/*
+ * Activate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
 static void
 spa_activate_log(spa_t *spa)
 {
@@ -2133,10 +2146,11 @@ spa_activate_log(spa_t *spa)
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
-		metaslab_group_t *mg = tvd->vdev_mg;
 
-		if (tvd->vdev_islog)
-			metaslab_group_activate(mg);
+		if (tvd->vdev_islog) {
+			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+			metaslab_group_activate(tvd->vdev_mg);
+		}
 	}
 }
 
@@ -8033,12 +8047,16 @@ spa_async_thread(void *arg)
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		old_space += metaslab_class_get_space(spa_special_class(spa));
 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
+		old_space += metaslab_class_get_space(
+		    spa_embedded_log_class(spa));
 
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		new_space += metaslab_class_get_space(spa_special_class(spa));
 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
+		new_space += metaslab_class_get_space(
+		    spa_embedded_log_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f49be8eec..b4c73f58d 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -349,9 +349,11 @@ int spa_asize_inflation = 24;
  * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
  * the pool to be consumed.  This ensures that we don't run the pool
  * completely out of space, due to unaccounted changes (e.g. to the MOS).
- * It also limits the worst-case time to allocate space.  If we have
- * less than this amount of free space, most ZPL operations (e.g. write,
- * create) will return ENOSPC.
+ * It also limits the worst-case time to allocate space.  If we have less than
+ * this amount of free space, most ZPL operations (e.g. write, create) will
+ * return ENOSPC.  The ZIL metaslabs (spa_embedded_log_class) are also part of
+ * this 3.2% of space which can't be consumed by normal writes; the slop space
+ * "proper" (spa_get_slop_space()) is decreased by the embedded log space.
  *
  * Certain operations (e.g. file removal, most administrative actions) can
  * use half the slop space.  They will only return ENOSPC if less than half
@@ -1026,10 +1028,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
 /*
  * Spares are tracked globally due to the following constraints:
  *
- * 	- A spare may be part of multiple pools.
- * 	- A spare may be added to a pool even if it's actively in use within
+ *	- A spare may be part of multiple pools.
+ *	- A spare may be added to a pool even if it's actively in use within
  *	  another pool.
- * 	- A spare in use in any pool can only be the source of a replacement if
+ *	- A spare in use in any pool can only be the source of a replacement if
  *	  the target is a spare in the same pool.
  *
  * We keep track of all spares on the system through the use of a reference
@@ -1236,6 +1238,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 	 */
 	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
 
@@ -1776,17 +1779,37 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 }
 
 /*
- * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
- * or at least 128MB, unless that would cause it to be more than half the
- * pool size.
- *
- * See the comment above spa_slop_shift for details.
+ * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
+ * (3.2%), minus the embedded log space.  On very small pools, it may be
+ * slightly larger than this. The embedded log space is not included in
+ * spa_dspace.  By subtracting it, the usable space (per "zfs list") is a
+ * constant 97% of the total space, regardless of metaslab size (assuming the
+ * default spa_slop_shift=5 and a non-tiny pool).
+ *
+ * See the comment above spa_slop_shift for more details.
  */
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
 	uint64_t space = spa_get_dspace(spa);
-	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+	uint64_t slop = space >> spa_slop_shift;
+
+	/*
+	 * Subtract the embedded log space, but no more than half the (3.2%)
+	 * unusable space.  Note, the "no more than half" is only relevant if
+	 * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by
+	 * default.
+	 */
+	uint64_t embedded_log =
+	    metaslab_class_get_dspace(spa_embedded_log_class(spa));
+	slop -= MIN(embedded_log, slop >> 1);
+
+	/*
+	 * Slop space should be at least spa_min_slop, but no more than half
+	 * the entire pool.
+	 */
+	slop = MAX(slop, MIN(space >> 1, spa_min_slop));
+	return (slop);
 }
 
 uint64_t
@@ -1873,6 +1896,12 @@ spa_log_class(spa_t *spa)
 }
 
 metaslab_class_t *
+spa_embedded_log_class(spa_t *spa)
+{
+	return (spa->spa_embedded_log_class);
+}
+
+metaslab_class_t *
 spa_special_class(spa_t *spa)
 {
 	return (spa->spa_special_class);
@@ -1891,12 +1920,10 @@ metaslab_class_t *
 spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
     uint_t level, uint_t special_smallblk)
 {
-	if (DMU_OT_IS_ZIL(objtype)) {
-		if (spa->spa_log_class->mc_groups != 0)
-			return (spa_log_class(spa));
-		else
-			return (spa_normal_class(spa));
-	}
+	/*
+	 * ZIL allocations determine their class in zio_alloc_zil().
+	 */
+	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
 	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
 
@@ -2432,9 +2459,9 @@ spa_fini(void)
 }
 
 /*
- * Return whether this pool has slogs. No locking needed.
+ * Return whether this pool has a dedicated slog device. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
- * performance and not correctness
+ * performance and not correctness.
  */
 boolean_t
 spa_has_slogs(spa_t *spa)
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 7ffe92421..f305da6f5 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -59,6 +59,27 @@
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class.  The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added).  See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations.  Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev.  This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL.  If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
+
 /* default target for number of metaslabs per top-level vdev */
 int zfs_vdev_default_ms_count = 200;
 
@@ -223,6 +244,22 @@ vdev_getops(const char *type)
 	return (ops);
 }
 
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. All vdevs may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+	if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+	    vd->vdev_log_mg != NULL)
+		return (vd->vdev_log_mg);
+	else
+		return (vd->vdev_mg);
+}
+
 /* ARGSUSED */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
@@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd)
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
+	if (vd->vdev_log_mg != NULL) {
+		ASSERT0(vd->vdev_ms_count);
+		metaslab_group_destroy(vd->vdev_log_mg);
+		vd->vdev_log_mg = NULL;
+	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
@@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+	if (tvd->vdev_log_mg)
+		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
 	tvd->vdev_mg = svd->vdev_mg;
+	tvd->vdev_log_mg = svd->vdev_log_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
+	svd->vdev_log_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
+	if (tvd->vdev_log_mg != NULL)
+		tvd->vdev_log_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
@@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd)
 	vdev_free(mvd);
 }
 
-static void
+void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
@@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd)
 		vd->vdev_mg = metaslab_group_create(mc, vd,
 		    spa->spa_alloc_count);
 
+		if (!vd->vdev_islog) {
+			vd->vdev_log_mg = metaslab_group_create(
+			    spa_embedded_log_class(spa), vd, 1);
+		}
+
 		/*
 		 * The spa ashift min/max only apply for the normal metaslab
 		 * class. Class destination is late binding so ashift boundry
@@ -1340,8 +1393,6 @@ int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
-	objset_t *mos = spa->spa_meta_objset;
-	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
@@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
-	for (m = oldc; m < newc; m++) {
-		uint64_t object = 0;
 
+	for (uint64_t m = oldc; m < newc; m++) {
+		uint64_t object = 0;
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
-			error = dmu_read(mos, vd->vdev_ms_array,
+			error = dmu_read(spa->spa_meta_objset,
+			    vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
@@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 			}
 		}
 
-#ifndef _KERNEL
-		/*
-		 * To accommodate zdb_leak_init() fake indirect
-		 * metaslabs, we allocate a metaslab group for
-		 * indirect vdevs which normally don't have one.
-		 */
-		if (vd->vdev_mg == NULL) {
-			ASSERT0(vdev_is_concrete(vd));
-			vdev_metaslab_group_create(vd);
-		}
-#endif
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
@@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 		}
 	}
 
+	/*
+	 * Find the emptiest metaslab on the vdev and mark it for use for
+	 * embedded slog by moving it from the regular to the log metaslab
+	 * group.
+	 */
+	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+		uint64_t slog_msid = 0;
+		uint64_t smallest = UINT64_MAX;
+
+		/*
+		 * Note, we only search the new metaslabs, because the old
+		 * (pre-existing) ones may be active (e.g. have non-empty
+		 * range_tree's), and we don't move them to the new
+		 * metaslab_t.
+		 */
+		for (uint64_t m = oldc; m < newc; m++) {
+			uint64_t alloc =
+			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
+			if (alloc < smallest) {
+				slog_msid = m;
+				smallest = alloc;
+			}
+		}
+		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+		/*
+		 * The metaslab was marked as dirty at the end of
+		 * metaslab_init(). Remove it from the dirty list so that we
+		 * can uninitialize and reinitialize it to the new class.
+		 */
+		if (txg != 0) {
+			(void) txg_list_remove_this(&vd->vdev_ms_list,
+			    slog_ms, txg);
+		}
+		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+		metaslab_fini(slog_ms);
+		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+		    &vd->vdev_ms[slog_msid]));
+	}
+
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
@@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 	 */
 	if (!expanding && !vd->vdev_removing) {
 		metaslab_group_activate(vd->vdev_mg);
+		if (vd->vdev_log_mg != NULL)
+			metaslab_group_activate(vd->vdev_log_mg);
 	}
 
 	if (txg == 0)
@@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd)
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *mg = vd->vdev_mg;
+
 		metaslab_group_passivate(mg);
+		if (vd->vdev_log_mg != NULL) {
+			ASSERT(!vd->vdev_islog);
+			metaslab_group_passivate(vd->vdev_log_mg);
+		}
 
 		uint64_t count = vd->vdev_ms_count;
 		for (uint64_t m = 0; m < count; m++) {
@@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd)
 		}
 		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
-
 		vd->vdev_ms_count = 0;
 
-		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 			ASSERT0(mg->mg_histogram[i]);
+			if (vd->vdev_log_mg != NULL)
+				ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+		}
 	}
 	ASSERT0(vd->vdev_ms_count);
 	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3531,8 +3622,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 	    != NULL)
 		metaslab_sync_done(msp, txg);
 
-	if (reassess)
+	if (reassess) {
 		metaslab_sync_reassess(vd->vdev_mg);
+		if (vd->vdev_log_mg != NULL)
+			metaslab_sync_reassess(vd->vdev_log_mg);
+	}
 }
 
 void
@@ -3856,6 +3950,7 @@ top:
 			/*
 			 * Prevent any future allocations.
 			 */
+			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
@@ -4256,6 +4351,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 		 */
 		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 		    vdev_is_concrete(vd)) {
+			/*
+			 * The vdev fragmentation rating doesn't take into
+			 * account the embedded slog metaslab (vdev_log_mg).
+			 * Since it's only one metaslab, it would have a tiny
+			 * impact on the overall fragmentation.
+			 */
 			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 			    vd->vdev_mg->mg_fragmentation : 0;
 		}
@@ -5234,6 +5335,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
 	"Disable cache flushes");
 
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+	"Minimum number of metaslabs required to dedicate one for log blocks");
+
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
 	param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
 	"Minimum ashift used when creating new top-level vdevs");
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 6eaaddd39..a758fe4fb 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1207,6 +1207,11 @@ vdev_remove_complete(spa_t *spa)
 		vd->vdev_mg = NULL;
 		spa_log_sm_set_blocklimit(spa);
 	}
+	if (vd->vdev_log_mg != NULL) {
+		ASSERT0(vd->vdev_ms_count);
+		metaslab_group_destroy(vd->vdev_log_mg);
+		vd->vdev_log_mg = NULL;
+	}
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 
@@ -1780,6 +1785,8 @@ spa_vdev_remove_cancel_impl(spa_t *spa)
 		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
 		vdev_t *vd = vdev_lookup_top(spa, vdid);
 		metaslab_group_activate(vd->vdev_mg);
+		ASSERT(!vd->vdev_islog);
+		metaslab_group_activate(vd->vdev_log_mg);
 		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
 	}
 
@@ -1858,6 +1865,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
+	ASSERT3P(vd->vdev_log_mg, ==, NULL);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
@@ -1893,6 +1901,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 
 	if (error != 0) {
 		metaslab_group_activate(mg);
+		ASSERT3P(vd->vdev_log_mg, ==, NULL);
 		return (error);
 	}
 	ASSERT0(vd->vdev_stat.vs_alloc);
@@ -2121,6 +2130,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 	 */
 	metaslab_group_t *mg = vd->vdev_mg;
 	metaslab_group_passivate(mg);
+	ASSERT(!vd->vdev_islog);
+	metaslab_group_passivate(vd->vdev_log_mg);
 
 	/*
 	 * Wait for the youngest allocations and frees to sync,
@@ -2157,6 +2168,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 
 	if (error != 0) {
 		metaslab_group_activate(mg);
+		ASSERT(!vd->vdev_islog);
+		metaslab_group_activate(vd->vdev_log_mg);
 		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 		spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 		spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 41b347b76..538a2a2cd 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2750,10 +2750,9 @@ zio_write_gang_done(zio_t *zio)
 }
 
 static zio_t *
-zio_write_gang_block(zio_t *pio)
+zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
 	spa_t *spa = pio->io_spa;
-	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
@@ -3470,6 +3469,17 @@ zio_dva_allocate(zio_t *zio)
 		zio->io_metaslab_class = mc;
 	}
 
+	/*
+	 * Try allocating the block in the usual metaslab class.
+	 * If that's full, allocate it in the normal class.
+	 * If that's full, allocate as a gang block,
+	 * and if all are full, the allocation fails (which shouldn't happen).
+	 *
+	 * Note that we do not fall back on embedded slog (ZIL) space, to
+	 * preserve unfragmented slog space, which is critical for decent
+	 * sync write performance.  If a log allocation fails, we will fall
+	 * back to spa_sync() which is abysmal for performance.
+	 */
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
@@ -3489,26 +3499,38 @@ zio_dva_allocate(zio_t *zio)
 			    zio->io_prop.zp_copies, zio->io_allocator, zio);
 			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
 
-			mc = spa_normal_class(spa);
-			VERIFY(metaslab_class_throttle_reserve(mc,
+			VERIFY(metaslab_class_throttle_reserve(
+			    spa_normal_class(spa),
 			    zio->io_prop.zp_copies, zio->io_allocator, zio,
 			    flags | METASLAB_MUST_RESERVE));
-		} else {
-			mc = spa_normal_class(spa);
 		}
-		zio->io_metaslab_class = mc;
+		zio->io_metaslab_class = mc = spa_normal_class(spa);
+		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+			zfs_dbgmsg("%s: metaslab allocation failure, "
+			    "trying normal class: zio %px, size %llu, error %d",
+			    spa_name(spa), zio, zio->io_size, error);
+		}
 
 		error = metaslab_alloc(spa, mc, zio->io_size, bp,
 		    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 		    &zio->io_alloc_list, zio, zio->io_allocator);
 	}
 
+	if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
+		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+			zfs_dbgmsg("%s: metaslab allocation failure, "
+			    "trying ganging: zio %px, size %llu, error %d",
+			    spa_name(spa), zio, zio->io_size, error);
+		}
+		return (zio_write_gang_block(zio, mc));
+	}
 	if (error != 0) {
-		zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
-		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
-		    error);
-		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
-			return (zio_write_gang_block(zio));
+		if (error != ENOSPC ||
+		    (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
+			zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
+			    "size %llu, error %d",
+			    spa_name(spa), zio, zio->io_size, error);
+		}
 		zio->io_error = error;
 	}
 
@@ -3588,15 +3610,18 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
 	int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
 	    spa->spa_alloc_count;
-	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp,
-	    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
-	if (error == 0) {
-		*slog = TRUE;
-	} else {
-		error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp,
-		    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
-		if (error == 0)
-			*slog = FALSE;
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
+	*slog = (error == 0);
+	if (error != 0) {
+		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
+		    new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, NULL, allocator);
+	}
+	if (error != 0) {
+		error = metaslab_alloc(spa, spa_normal_class(spa), size,
+		    new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, NULL, allocator);
 	}
 	metaslab_trace_fini(&io_alloc_list);
author	Matthew Ahrens <[email protected]>	2021-01-21 15:12:54 -0800
committer	GitHub <[email protected]>	2021-01-21 15:12:54 -0800
commit	aa755b35493a2d27dbbc59f1527c6c93788b28f6 (patch)
tree	a0be60ea189e4b89772e57a7628b6a2dea4bdcce /module/zfs
parent	984362a71ebef106d05a8f68f241c5c610ff6547 (diff)