From 492f64e941e3d6b947d1cc387a1a380c0c738b09 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Mon, 12 Feb 2018 12:56:06 -0800
Subject: OpenZFS 9112 - Improve allocation performance on high-end systems

Overview
========

We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch.

The active metaslabs for each allocator are moved from their normal
place in the metaslab tree for the group to the back of the tree. This
way, they will not be selected for use by other allocators searching
for new metaslabs unless all the passive metaslabs are unsuitable for
allocations. If that does happen, the allocators will "steal" from
each other to ensure that IOs don't fail until there is truly no space
left to perform allocations.

In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to
dramatically increase the number of inflight IOs on low-end systems,
because it can significantly increase txg times. On the other hand, we
want to ensure that there are enough IOs for each allocator to allow
for good coalescing before sending the IOs to the disk. As a result,
we take a compromise path; each allocator's alloc queue max depth
starts at a certain value for every txg. Every time an IO completes,
we increase the max depth. This should hopefully provide a good
balance between the two failure modes, while not dramatically
increasing complexity.

We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.

Performance Results
===================

Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async
sequential write workload on a 24 core NUMA system with 256 GB of RAM
and eight 128 GB SSDs, there is a roughly 25% performance improvement.

Future Work
===========

Analysis of the performance of the system with this patch applied
shows that a significant new bottleneck is the vdev disk queues, which
also need to be parallelized. Prototyping of this change has occurred,
and there was a performance improvement, but more work needs to be
done before its stability has been verified and it is ready to be
upstreamed.

Authored by: Paul Dagnelie
Reviewed by: Matthew Ahrens
Reviewed by: George Wilson
Reviewed by: Serapheim Dimitropoulos
Reviewed by: Alexander Motin
Reviewed by: Brian Behlendorf
Approved by: Gordon Ross
Ported-by: Paul Dagnelie
Signed-off-by: Paul Dagnelie

Porting Notes:
* Fix reservation test failures by increasing tolerance.
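Illustrative sketch (not part of this patch): a minimal, self-contained
C model of the per-allocator queue depth ramping described in the
Overview. The names spa_alloc_count, mg_cur_max_alloc_queue_depth,
mg_max_alloc_queue_depth, and zfs_vdev_def_queue_depth come from the
diff below; the standalone types, constants, and helpers here are
simplified stand-ins rather than the actual ZFS implementation, and the
place where the real code performs the increment is not visible in this
spa.c-only view.

#include <stdint.h>
#include <stdio.h>

#define	ZFS_VDEV_DEF_QUEUE_DEPTH	32	/* assumed starting depth per allocator */
#define	MG_MAX_ALLOC_QUEUE_DEPTH	256	/* assumed group-wide ceiling */
#define	SPA_ALLOC_COUNT			4	/* assumed allocators per metaslab group */

typedef struct metaslab_group_model {
	/* per-allocator current cap, reset at the start of every txg */
	uint64_t mg_cur_max_alloc_queue_depth[SPA_ALLOC_COUNT];
	uint64_t mg_max_alloc_queue_depth;	/* group-wide ceiling */
} metaslab_group_model_t;

/* Models the per-txg reset performed in spa_sync() in the diff below. */
static void
mg_reset_queue_depths(metaslab_group_model_t *mg)
{
	mg->mg_max_alloc_queue_depth = MG_MAX_ALLOC_QUEUE_DEPTH;
	for (int i = 0; i < SPA_ALLOC_COUNT; i++)
		mg->mg_cur_max_alloc_queue_depth[i] = ZFS_VDEV_DEF_QUEUE_DEPTH;
}

/*
 * Hypothetical IO-completion hook: each completed allocation IO lets
 * its allocator queue a little deeper, up to the group-wide ceiling,
 * so depth only grows on systems that are actually retiring IOs.
 */
static void
mg_io_done(metaslab_group_model_t *mg, int allocator)
{
	uint64_t *cur = &mg->mg_cur_max_alloc_queue_depth[allocator];

	if (*cur < mg->mg_max_alloc_queue_depth)
		(*cur)++;
}

int
main(void)
{
	metaslab_group_model_t mg;

	mg_reset_queue_depths(&mg);
	for (int io = 0; io < 100; io++)
		mg_io_done(&mg, io % SPA_ALLOC_COUNT);

	printf("allocator 0 depth after 100 completions: %llu\n",
	    (unsigned long long)mg.mg_cur_max_alloc_queue_depth[0]);
	return (0);
}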
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
---
 module/zfs/spa.c | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

(limited to 'module/zfs/spa.c')

diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 8ab7c3428..537e19068 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7652,9 +7652,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
-	mutex_enter(&spa->spa_alloc_lock);
-	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-	mutex_exit(&spa->spa_alloc_lock);
+	for (int i = 0; i < spa->spa_alloc_count; i++) {
+		mutex_enter(&spa->spa_alloc_locks[i]);
+		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+		mutex_exit(&spa->spa_alloc_locks[i]);
+	}
 
 	/*
 	 * If there are any pending vdev state changes, convert them
@@ -7715,7 +7717,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * The max queue depth will not change in the middle of syncing
 	 * out this txg.
 	 */
-	uint64_t queue_depth_total = 0;
+	uint64_t slots_per_allocator = 0;
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
@@ -7729,18 +7731,23 @@ spa_sync(spa_t *spa, uint64_t txg)
 		 * allocations look at mg_max_alloc_queue_depth, and async
 		 * allocations all happen from spa_sync().
 		 */
-		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+		for (int i = 0; i < spa->spa_alloc_count; i++)
+			ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
 		mg->mg_max_alloc_queue_depth = max_queue_depth;
-		queue_depth_total += mg->mg_max_alloc_queue_depth;
+
+		for (int i = 0; i < spa->spa_alloc_count; i++) {
+			mg->mg_cur_max_alloc_queue_depth[i] =
+			    zfs_vdev_def_queue_depth;
+		}
+		slots_per_allocator += zfs_vdev_def_queue_depth;
 	}
 
 	metaslab_class_t *mc = spa_normal_class(spa);
-	ASSERT0(refcount_count(&mc->mc_alloc_slots));
-	mc->mc_alloc_max_slots = queue_depth_total;
+	for (int i = 0; i < spa->spa_alloc_count; i++) {
+		ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
+		mc->mc_alloc_max_slots[i] = slots_per_allocator;
+	}
 	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
-	ASSERT3U(mc->mc_alloc_max_slots, <=,
-	    max_queue_depth * rvd->vdev_children);
-
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 		vdev_indirect_state_sync_verify(vd);
@@ -7920,9 +7927,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 
 	dsl_pool_sync_done(dp, txg);
 
-	mutex_enter(&spa->spa_alloc_lock);
-	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-	mutex_exit(&spa->spa_alloc_lock);
+	for (int i = 0; i < spa->spa_alloc_count; i++) {
+		mutex_enter(&spa->spa_alloc_locks[i]);
+		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+		mutex_exit(&spa->spa_alloc_locks[i]);
+	}
 
 	/*
 	 * Update usable space statistics.
--
cgit v1.2.3