path: root/module
author     Alexander Motin <[email protected]>  2020-12-15 13:55:44 -0500
committer  GitHub <[email protected]>           2020-12-15 10:55:44 -0800
commit     f8020c936356b887ab1e03eba1a723f9dfda6eea (patch)
tree       f58ab8f7b84363d1e58d014d2c4b15a3b468efbe /module
parent     e2d952cda06a649441eba01730753e1a24bf2c83 (diff)
Make metaslab class rotor and aliquot per-allocator.
Metaslab rotor and aliquot are used to distribute workload between vdevs while keeping some locality for logically adjacent blocks. Once multiple allocators were introduced to separate allocation of different objects, it no longer makes much sense for different allocators to write into different metaslabs of the same metaslab group (vdev) at the same time, competing for its resources. This change makes each allocator choose its metaslab group independently, colliding with the others only sporadically.

A test involving simultaneous writes into 4 files with a recordsize of 4KB on a striped pool of 30 disks, on a system with 40 logical cores, shows a reduction of vdev queue lock contention from 54% to 27% due to better load distribution. Unfortunately it won't help ZVOLs much yet, since only one dataset/ZVOL is synced at a time, and so for the most part only one allocator is used; but it may improve later.

While there, to reduce the number of pointer dereferences, change the per-allocator storage for metaslab classes and groups from several separate malloc()'s to variable-length arrays at the ends of the original class and group structures.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Closes #11288
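
The variable-length-array trick described above is the classic C flexible-array-member idiom: one allocation covers both the fixed header and the per-allocator tail, so reaching element i costs a single pointer dereference instead of two. A minimal user-space sketch of the pattern, assuming hypothetical example_* names (the real structures are metaslab_class_t and metaslab_group_t in the diff below, and the kernel code uses kmem_zalloc()/kmem_free() rather than calloc()/free()):

#include <stddef.h>
#include <stdlib.h>

typedef struct example_allocator {
	void		*ea_rotor;	/* stands in for mca_rotor */
	unsigned long	ea_aliquot;	/* stands in for mca_aliquot */
} example_allocator_t;

typedef struct example_class {
	int			ec_alloc_count;
	example_allocator_t	ec_allocator[];	/* tail array */
} example_class_t;

static example_class_t *
example_class_create(int count)
{
	/*
	 * One calloc() covers the header and all trailing elements.
	 * offsetof() with a variable index is a common compiler
	 * extension, used the same way by the kernel code below.
	 */
	example_class_t *ec = calloc(1,
	    offsetof(example_class_t, ec_allocator[count]));

	if (ec != NULL)
		ec->ec_alloc_count = count;
	return (ec);
}

static void
example_class_destroy(example_class_t *ec)
{
	/*
	 * free() tracks the allocation size for us; kmem_free() does
	 * not, which is why the diff recomputes the same offsetof()
	 * expression on the destroy path.
	 */
	free(ec);
}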
Diffstat (limited to 'module')
-rw-r--r--  module/zfs/metaslab.c  126
-rw-r--r--  module/zfs/spa.c        21
-rw-r--r--  module/zfs/spa_misc.c    2
-rw-r--r--  module/zfs/zio.c         9
4 files changed, 82 insertions, 76 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index fcf1285f6..ea1720a2c 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -396,20 +396,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
metaslab_class_t *mc;
- mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc = kmem_zalloc(offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
mc->mc_spa = spa;
- mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
- mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (zfs_refcount_t), KM_SLEEP);
- mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (uint64_t), KM_SLEEP);
- for (int i = 0; i < spa->spa_alloc_count; i++)
- zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ mca->mca_rotor = NULL;
+ zfs_refcount_create_tracked(&mca->mca_alloc_slots);
+ }
return (mc);
}
@@ -417,21 +416,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
void
metaslab_class_destroy(metaslab_class_t *mc)
{
- ASSERT(mc->mc_rotor == NULL);
+ spa_t *spa = mc->mc_spa;
+
ASSERT(mc->mc_alloc == 0);
ASSERT(mc->mc_deferred == 0);
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
- for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
- zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
- kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
- sizeof (zfs_refcount_t));
- kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
- sizeof (uint64_t));
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ ASSERT(mca->mca_rotor == NULL);
+ zfs_refcount_destroy(&mca->mca_alloc_slots);
+ }
mutex_destroy(&mc->mc_lock);
multilist_destroy(mc->mc_metaslab_txg_list);
- kmem_free(mc, sizeof (metaslab_class_t));
+ kmem_free(mc, offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]));
}
int
@@ -446,7 +446,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
- if ((mg = mc->mc_rotor) == NULL)
+ if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
return (0);
do {
@@ -455,7 +455,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT3P(vd->vdev_top, ==, vd);
ASSERT3P(mg->mg_class, ==, mc);
ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
- } while ((mg = mg->mg_next) != mc->mc_rotor);
+ } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
return (0);
}
@@ -812,7 +812,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
metaslab_group_t *mg;
- mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mg = kmem_zalloc(offsetof(metaslab_group_t,
+ mg_allocator[allocators]), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
@@ -825,8 +826,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_no_free_space = B_TRUE;
mg->mg_allocators = allocators;
- mg->mg_allocator = kmem_zalloc(allocators *
- sizeof (metaslab_group_allocator_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) {
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
@@ -860,21 +859,19 @@ metaslab_group_destroy(metaslab_group_t *mg)
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
}
- kmem_free(mg->mg_allocator, mg->mg_allocators *
- sizeof (metaslab_group_allocator_t));
-
- kmem_free(mg, sizeof (metaslab_group_t));
+ kmem_free(mg, offsetof(metaslab_group_t,
+ mg_allocator[mg->mg_allocators]));
}
void
metaslab_group_activate(metaslab_group_t *mg)
{
metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
metaslab_group_t *mgprev, *mgnext;
- ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
- ASSERT(mc->mc_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count <= 0);
@@ -885,7 +882,7 @@ metaslab_group_activate(metaslab_group_t *mg)
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg);
- if ((mgprev = mc->mc_rotor) == NULL) {
+ if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
mg->mg_prev = mg;
mg->mg_next = mg;
} else {
@@ -895,7 +892,10 @@ metaslab_group_activate(metaslab_group_t *mg)
mgprev->mg_next = mg;
mgnext->mg_prev = mg;
}
- mc->mc_rotor = mg;
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mc->mc_allocator[i].mca_rotor = mg;
+ mg = mg->mg_next;
+ }
}
/*
@@ -916,7 +916,8 @@ metaslab_group_passivate(metaslab_group_t *mg)
(SCL_ALLOC | SCL_ZIO));
if (--mg->mg_activation_count != 0) {
- ASSERT(mc->mc_rotor != mg);
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT(mc->mc_allocator[i].mca_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count < 0);
@@ -963,12 +964,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
mgnext = mg->mg_next;
if (mg == mgnext) {
- mc->mc_rotor = NULL;
+ mgnext = NULL;
} else {
- mc->mc_rotor = mgnext;
mgprev->mg_next = mgnext;
mgnext->mg_prev = mgprev;
}
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ if (mc->mc_allocator[i].mca_rotor == mg)
+ mc->mc_allocator[i].mca_rotor = mgnext;
+ }
mg->mg_prev = NULL;
mg->mg_next = NULL;
@@ -1202,7 +1206,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
- * check if we have reached our allocation limit (mg_alloc_queue_depth)
+ * check if we have reached our allocation limit (mga_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
@@ -4517,13 +4521,14 @@ static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ metaslab_class_allocator_t *mca =
+ &mg->mg_class->mc_allocator[allocator];
uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
while (cur < max) {
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
cur, cur + 1) == cur) {
- atomic_inc_64(
- &mg->mg_class->mc_alloc_max_slots[allocator]);
+ atomic_inc_64(&mca->mca_alloc_max_slots);
return;
}
cur = mga->mga_cur_max_alloc_queue_depth;
@@ -5059,6 +5064,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal, int allocator)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@@ -5080,7 +5086,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_aliquot because
+ * Note that there's no locking on mca_rotor or mca_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
@@ -5116,23 +5122,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
mg->mg_next != NULL)
mg = mg->mg_next;
} else {
- mg = mc->mc_rotor;
+ mg = mca->mca_rotor;
}
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) {
- mg = fast_mg = mc->mc_rotor;
+ mg = fast_mg = mca->mca_rotor;
do {
if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg;
- } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+ } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else {
- ASSERT(mc->mc_rotor != NULL);
- mg = mc->mc_rotor;
+ ASSERT(mca->mca_rotor != NULL);
+ mg = mca->mca_rotor;
}
/*
@@ -5140,7 +5146,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* metaslab group that has been passivated, just follow the rotor.
*/
if (mg->mg_class != mc || mg->mg_activation_count <= 0)
- mg = mc->mc_rotor;
+ mg = mca->mca_rotor;
rotor = mg;
top:
@@ -5218,7 +5224,7 @@ top:
* Bias is also used to compensate for unequally
* sized vdevs so that space is allocated fairly.
*/
- if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
+ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vs_free = vs->vs_space - vs->vs_alloc;
int64_t mc_free = mc->mc_space - mc->mc_alloc;
@@ -5256,10 +5262,10 @@ top:
}
if ((flags & METASLAB_FASTWRITE) ||
- atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+ atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -5276,8 +5282,8 @@ top:
return (0);
}
next:
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
/*
@@ -5595,15 +5601,15 @@ boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
- uint64_t max = mc->mc_alloc_max_slots[allocator];
+ uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- uint64_t reserved_slots =
- zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
+ uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
if (reserved_slots < max)
available_slots = max - reserved_slots;
@@ -5613,11 +5619,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
- for (int d = 0; d < slots; d++) {
- reserved_slots =
- zfs_refcount_add(&mc->mc_alloc_slots[allocator],
- zio);
- }
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
}
@@ -5630,12 +5633,12 @@ void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
int allocator, zio_t *zio)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- for (int d = 0; d < slots; d++) {
- (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
- zio);
- }
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_remove(&mca->mca_alloc_slots, zio);
mutex_exit(&mc->mc_lock);
}
@@ -5789,7 +5792,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- if (mc->mc_rotor == NULL) { /* no vdevs in this class */
+ if (mc->mc_allocator[allocator].mca_rotor == NULL) {
+ /* no vdevs in this class */
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (SET_ERROR(ENOSPC));
}
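
One detail of the metaslab.c changes above deserves a closer look: metaslab_group_activate() now seeds each allocator's rotor one group further around the ring, so concurrent allocators start on different vdevs and only collide when their independent aliquot counters happen to line up. A hedged sketch of that seeding loop, with hypothetical group_t/rotors names standing in for metaslab_group_t and the mca_rotor fields:

/*
 * Stagger per-allocator rotors around a circular linked list of
 * groups, as the loop added to metaslab_group_activate() does.
 */
typedef struct group {
	struct group *g_next;	/* circular: last points to first */
} group_t;

static void
stagger_rotors(group_t **rotors, int alloc_count, group_t *start)
{
	group_t *g = start;

	for (int i = 0; i < alloc_count; i++) {
		rotors[i] = g;		/* allocator i starts here */
		g = g->g_next;		/* next allocator, next group */
	}
}

If the ring holds fewer groups than there are allocators, the walk simply wraps, so some allocators still share a starting group; that is consistent with the commit message's claim of colliding "only sporadically" rather than never.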
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 50822cfae..53ffbc31c 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2111,9 +2111,6 @@ spa_passivate_log(spa_t *spa)
ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
- if (!spa_has_slogs(spa))
- return (B_FALSE);
-
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
@@ -8883,12 +8880,18 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
}
for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
- normal->mc_alloc_max_slots[i] = slots_per_allocator;
- special->mc_alloc_max_slots[i] = slots_per_allocator;
- dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
+ mca_alloc_slots));
+ normal->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ special->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ dedup->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
}
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index c6b3e8c11..633dd1c0c 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2436,7 +2436,7 @@ spa_fini(void)
boolean_t
spa_has_slogs(spa_t *spa)
{
- return (spa->spa_log_class->mc_rotor != NULL);
+ return (spa->spa_log_class->mc_groups != 0);
}
spa_log_state_t
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 982940dbd..ba438353a 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2783,8 +2783,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(has_data);
flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
- pio));
+ VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
+ mca_alloc_slots, pio));
/*
* The logical zio has already placed a reservation for
@@ -4468,9 +4468,8 @@ zio_done(zio_t *zio)
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
- VERIFY(zfs_refcount_not_held(
- &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
- zio));
+ VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
+ mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
}
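
The zio.c hunks above verify per-allocator reservation refcounts. Stripped of the zfs_refcount_t machinery, the throttle protocol is a bounded counter per allocator, updated under the class lock; a minimal sketch, assuming hypothetical alloc_throttle_t/throttle_* names and a pthread mutex in place of mc_lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct alloc_throttle {
	uint64_t	at_used;	/* stands in for mca_alloc_slots */
	uint64_t	at_max;		/* stands in for mca_alloc_max_slots */
} alloc_throttle_t;

static pthread_mutex_t throttle_lock = PTHREAD_MUTEX_INITIALIZER;

static bool
throttle_reserve(alloc_throttle_t *at, uint64_t slots)
{
	bool ok = false;

	pthread_mutex_lock(&throttle_lock);
	if (at->at_used + slots <= at->at_max) {
		at->at_used += slots;	/* one slot per DVA copy */
		ok = true;
	}
	pthread_mutex_unlock(&throttle_lock);
	return (ok);
}

static void
throttle_unreserve(alloc_throttle_t *at, uint64_t slots)
{
	pthread_mutex_lock(&throttle_lock);
	at->at_used -= slots;
	pthread_mutex_unlock(&throttle_lock);
}

Note the slot counters were per-allocator even before this commit; the change here folds them into the tail array of the class structure so that looking one up no longer chases a separately allocated array pointer.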