author		Alexander Motin <[email protected]>	2024-05-01 14:07:20 -0400
committer	GitHub <[email protected]>	2024-05-01 11:07:20 -0700
commit		645b83307918085ab2f0e12618809e348635b34f (patch)
tree		50ca528cdfced535d6e3d88999b71bda8d712f58 /module/zfs/spa_misc.c
parent		8fd3a5d02f3f6bad9e8e65b6aded694eae222bf2 (diff)
Improve write issue taskq utilization
- Reduce the number of allocators on small systems down to one per 4
  CPU cores, keeping the maximum at 4 on 16+ core systems.  Small
  systems should not have the lock contention that multiple allocators
  are supposed to solve, while keeping several metaslabs open and
  modified each TXG is not free.
- Reduce the number of write issue taskqs down to one per 16 CPU
  cores, while keeping it an integer fraction of the number of
  allocators.  On mid-sized systems, where multiple allocators already
  make sense, too many write issue taskqs may reduce write speed on
  single-file workloads, since a single file is handled by only one
  taskq to reduce fragmentation.  On large systems, which can actually
  benefit from the better IOPS of many taskqs, the bottleneck is less
  important, since in the worst case there will be at least 16 cores
  to handle it.
- Distribute dnodes between allocators (and taskqs) in a round-robin
  fashion instead of relying on sync taskqs to be balanced (see the
  sketch below).  The latter is not guaranteed and may depend on
  scheduling.
- Remove io_wr_iss_tq from struct zio; io_allocator is enough.

Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #16130
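[Editor's note] For illustration, here is a minimal, hypothetical sketch of the round-robin idea from the third bullet. The real dnode-to-allocator assignment lives in the DMU/ZIO code, which is outside this spa_misc.c diff, so the helper name and the modulo scheme below are assumptions, not the committed implementation.

#include <stdint.h>

/*
 * Hypothetical helper: map consecutive dnode object numbers onto
 * consecutive allocators so each allocator (and the write issue
 * taskq behind it) receives an equal share of the work, independent
 * of how the sync taskqs happen to be scheduled.
 */
static inline int
dnode_pick_allocator(uint64_t object, int spa_alloc_count)
{
	return ((int)(object % (uint64_t)spa_alloc_count));
}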
Diffstat (limited to 'module/zfs/spa_misc.c')
-rw-r--r--	module/zfs/spa_misc.c	22	+++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5fb7847b5..e6d4a9bdb 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
* Number of allocators to use, per spa instance
*/
static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
/*
* Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
- /* Do not allow more allocators than CPUs. */
- spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+ /* Do not allow more allocators than a fraction of the CPUs. */
+ spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+ boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
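[Editor's note] To see what the new clamp produces, here is a small standalone userland sketch; the MIN/MAX macros and tunable defaults are reproduced from this diff, and boot_ncpus is passed in as a plain argument rather than read from the kernel.

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int spa_num_allocators = 4;
static int spa_cpus_per_allocator = 4;

static int
alloc_count(int boot_ncpus)
{
	/* Same expression as the spa_add() hunk above. */
	return (MAX(MIN(spa_num_allocators,
	    boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1));
}

int
main(void)
{
	int cpus[] = { 1, 4, 8, 16, 64 };

	/* Prints 1, 1, 2, 4, 4: one allocator per 4 CPUs, capped at 4. */
	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%2d CPUs -> %d\n", cpus[i], alloc_count(cpus[i]));
	return (0);
}

The old code clamped only to boot_ncpus, so a 4-core machine still got 4 allocators; with the new expression it gets one.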
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
+ if (spa->spa_alloc_count > 1) {
+ spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+ mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ }
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
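[Editor's note] The kmem_zalloc()/offsetof() pairing above is the standard C flexible-array-member idiom: offsetof(type, array[n]) yields the size of the struct header plus exactly n trailing elements. A minimal userland sketch, assuming a struct layout along the lines of what this commit adds to spa_impl.h (the real definition is not part of this diff):

#include <stddef.h>
#include <stdlib.h>
#include <stdbool.h>
#include <pthread.h>

/* Assumed shape of spa_allocs_use_t; see spa_impl.h for the real one. */
typedef struct spa_allocs_use {
	pthread_mutex_t	sau_lock;	/* kmutex_t in the kernel */
	bool		sau_inuse[];	/* one in-use flag per allocator */
} spa_allocs_use_t;

static spa_allocs_use_t *
allocs_use_alloc(int alloc_count)
{
	/* Header plus alloc_count flags, zeroed, in one allocation. */
	spa_allocs_use_t *sau =
	    calloc(1, offsetof(spa_allocs_use_t, sau_inuse[alloc_count]));

	if (sau != NULL)
		pthread_mutex_init(&sau->sau_lock, NULL);
	return (sau);
}

Note that offsetof() with a runtime array index, as used both here and in the kernel hunk above, relies on a widely supported GCC/Clang extension.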
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
}
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
+ if (spa->spa_alloc_count > 1) {
+ mutex_destroy(&spa->spa_allocs_use->sau_lock);
+ kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]));
+ }
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
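[Editor's note] The free side must recompute the identical offsetof() size, since kmem_free(), unlike libc free(), takes the allocation size as an argument. Continuing the userland sketch from above:

static void
allocs_use_free(spa_allocs_use_t *sau)
{
	pthread_mutex_destroy(&sau->sau_lock);
	free(sau);	/* libc free() needs no size argument */
}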
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
param_get_uint, ZMOD_RW, "Reserved free space in pool");
ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
- "Number of allocators per spa, capped by ncpus");
+ "Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+	"Minimum number of CPUs per allocator");
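[Editor's note] Both knobs are ZFS_MODULE_PARAM() tunables, so by the usual naming conventions they should surface on Linux as /sys/module/zfs/parameters/spa_num_allocators and /sys/module/zfs/parameters/spa_cpus_per_allocator, and on FreeBSD as the sysctls vfs.zfs.spa.num_allocators and vfs.zfs.spa.cpus_per_allocator (exact names inferred from the macro arguments, not verified here). Since spa_alloc_count is computed in spa_add(), a changed value takes effect at the next pool creation or import, not on already-imported pools.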