author	Alexander Motin <[email protected]>	2021-07-21 08:40:36 -0400
committer	Tony Hutter <[email protected]>	2021-09-14 12:40:15 -0700
commit	32c0b6468cbcfbd6c2c4bc08f88f34e016b4f184 (patch)
tree	d1c1adb19a5db5faee4aa2f32959900632a8e8b0 /module/zfs
parent	7c61e1ef9d9f6c5fa6a3665a88838a19120cf07b (diff)
Optimize allocation throttling
Remove mc_lock use from metaslab_class_throttle_*(). The math there is based on refcounts and so is atomic; the only possible race is between zfs_refcount_count() and zfs_refcount_add(). But in most cases metaslab_class_throttle_reserve() is called with the allocator lock held, which covers that race. In the cases where the lock is not held, GANG_ALLOCATION() or METASLAB_MUST_RESERVE is set, and zfs_refcount_count() is not consulted. And even assuming some other, non-existing scenario, the worst this race can cause is a few more I/Os reaching allocation slightly earlier, which is not a problem.

Move the locks and data of different allocators into different cache lines to avoid false sharing. Group the spa_alloc_* arrays together into a single array of aligned struct spa_alloc, spa_allocs. Align struct metaslab_class_allocator.

Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Ryan Moeller <[email protected]>
Reviewed-by: Don Brady <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Closes #12314
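The spa_allocs grouping described above is defined in the spa_impl.h header, which is outside this module/zfs-limited diff. The following is only a minimal sketch of the layout the consumers below assume; the field names match their uses in the diff, while the alignment attribute and 64-byte line size are illustrative assumptions (the real header may use a dedicated alignment macro):

/*
 * Illustrative sketch, not the actual header change: each allocator's
 * queue lock and sorted queue are grouped into one per-allocator struct,
 * aligned to a cache line so different allocators never share a line
 * (avoiding false sharing).
 */
typedef struct spa_alloc {
	kmutex_t	spaa_lock;	/* protects spaa_tree */
	avl_tree_t	spaa_tree;	/* zios queued for allocation, by bookmark */
} __attribute__((aligned(64))) spa_alloc_t;	/* 64-byte cache line assumed */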
Diffstat (limited to 'module/zfs')
-rw-r--r--	module/zfs/metaslab.c	| 20
-rw-r--r--	module/zfs/spa.c	| 12
-rw-r--r--	module/zfs/spa_misc.c	| 21
-rw-r--r--	module/zfs/zio.c	| 33
4 files changed, 35 insertions(+), 51 deletions(-)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 23f3e2989..93d409ceb 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
- uint64_t available_slots = 0;
- boolean_t slot_reserved = B_FALSE;
uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled);
- mutex_enter(&mc->mc_lock);
-
- uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
- if (reserved_slots < max)
- available_slots = max - reserved_slots;
-
- if (slots <= available_slots || GANG_ALLOCATION(flags) ||
- flags & METASLAB_MUST_RESERVE) {
+ if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
+ zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
@@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
for (int d = 0; d < slots; d++)
zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
- slot_reserved = B_TRUE;
+ return (B_TRUE);
}
-
- mutex_exit(&mc->mc_lock);
- return (slot_reserved);
+ return (B_FALSE);
}
void
@@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++)
zfs_refcount_remove(&mca->mca_alloc_slots, zio);
- mutex_exit(&mc->mc_lock);
}
static int
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index f6dce076d..2a4db7d56 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_pass = 0;
for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
+ mutex_enter(&spa->spa_allocs[i].spaa_lock);
+ VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+ mutex_exit(&spa->spa_allocs[i].spaa_lock);
}
/*
@@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
+ mutex_enter(&spa->spa_allocs[i].spaa_lock);
+ VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+ mutex_exit(&spa->spa_allocs[i].spaa_lock);
}
/*
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6ffe11a86..add108b77 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -701,13 +701,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_root = spa_strdup(altroot);
spa->spa_alloc_count = spa_allocators;
- spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
- sizeof (kmutex_t), KM_SLEEP);
- spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
- sizeof (avl_tree_t), KM_SLEEP);
+ spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (spa_alloc_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
- avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@@ -800,13 +799,11 @@ spa_remove(spa_t *spa)
}
for (int i = 0; i < spa->spa_alloc_count; i++) {
- avl_destroy(&spa->spa_alloc_trees[i]);
- mutex_destroy(&spa->spa_alloc_locks[i]);
+ avl_destroy(&spa->spa_allocs[i].spaa_tree);
+ mutex_destroy(&spa->spa_allocs[i].spaa_lock);
}
- kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
- sizeof (kmutex_t));
- kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
- sizeof (avl_tree_t));
+ kmem_free(spa->spa_allocs, spa->spa_alloc_count *
+ sizeof (spa_alloc_t));
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6030b3813..76ed4fad4 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
- if (zio->io_metaslab_class == NULL)
- zio->io_metaslab_class = pio->io_metaslab_class;
+ zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -3380,9 +3379,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
- ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+ ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
- zio = avl_first(&spa->spa_alloc_trees[allocator]);
+ zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
if (zio == NULL)
return (NULL);
@@ -3394,11 +3393,11 @@ zio_io_to_allocate(spa_t *spa, int allocator)
*/
ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
- zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+ zio->io_prop.zp_copies, allocator, zio, 0)) {
return (NULL);
}
- avl_remove(&spa->spa_alloc_trees[allocator], zio);
+ avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
@@ -3422,8 +3421,8 @@ zio_dva_throttle(zio_t *zio)
return (zio);
}
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
@@ -3435,14 +3434,14 @@ zio_dva_throttle(zio_t *zio)
* into 2^20 block regions, and then hash based on the objset, object,
* level, and region to accomplish both of these goals.
*/
- zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
- mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_allocator = allocator;
zio->io_metaslab_class = mc;
- avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
- nio = zio_io_to_allocate(spa, zio->io_allocator);
- mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+ mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
+ avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
+ nio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
return (nio);
}
@@ -3451,9 +3450,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
- mutex_enter(&spa->spa_alloc_locks[allocator]);
+ mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
zio = zio_io_to_allocate(spa, allocator);
- mutex_exit(&spa->spa_alloc_locks[allocator]);
+ mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
if (zio == NULL)
return;
@@ -3643,8 +3642,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
- int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
- spa->spa_alloc_count;
+ int allocator = (uint_t)cityhash4(0, 0, 0,
+ os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, NULL, allocator);
*slog = (error == 0);
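As a side note on the metaslab.c hunk above: the lock removal relies only on the atomicity of the individual refcount operations. A standalone userland sketch of the same check-then-add pattern (not ZFS code; plain C11 atomics stand in for zfs_refcount_*) shows why the race is benign: two racing reservers can both pass the check and transiently overshoot the limit by at most their slot counts, i.e. a few extra I/Os may reach allocation early, but no reservation is ever lost.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long reserved;	/* models mca_alloc_slots */

/* Models metaslab_class_throttle_reserve() without mc_lock. */
static bool
throttle_reserve(unsigned long slots, unsigned long max, bool must_reserve)
{
	/*
	 * The load and the add are each atomic but not jointly atomic;
	 * a concurrent caller may add between them.  The only consequence
	 * is a transient overshoot of max, never a lost or corrupt count.
	 */
	if (must_reserve || atomic_load(&reserved) + slots <= max) {
		atomic_fetch_add(&reserved, slots);
		return (true);
	}
	return (false);
}

/* Models metaslab_class_throttle_unreserve(). */
static void
throttle_unreserve(unsigned long slots)
{
	atomic_fetch_sub(&reserved, slots);
}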