From aa755b35493a2d27dbbc59f1527c6c93788b28f6 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Thu, 21 Jan 2021 15:12:54 -0800
Subject: Set aside a metaslab for ZIL blocks

Mixing ZIL and normal allocations has several problems:

1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed.  This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.

2. When under moderate load, ZIL allocations are of 128KB.  If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more.  The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC.  All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading.  This can cause a significant performance
impact.

3. If the pool is very fragmented, there may be zero free chunks of
128KB or more.  In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.

These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.  This change sets aside one metaslab from each top-level
vdev that is preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class).  From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.

Log (ZIL) blocks can be allocated from the following locations.  Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)

The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.

On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes.  On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).

Reviewed-by: Serapheim Dimitropoulos
Reviewed-by: George Wilson
Reviewed-by: Don Brady
Reviewed-by: Mark Maybee
Signed-off-by: Matthew Ahrens
Closes #11389
---
 cmd/zdb/zdb.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'cmd')
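The three-tier allocation order listed above can be pictured with a small
stand-alone C sketch.  This is an illustration only, not OpenZFS code: the
alloc_class enum and the try_alloc_from() helper are hypothetical stand-ins
for attempting a metaslab allocation from spa_log_class,
spa_embedded_log_class, and spa_normal_class in turn.

/*
 * Illustration only, not OpenZFS code: alloc_class and try_alloc_from()
 * are hypothetical stand-ins for allocating from spa_log_class,
 * spa_embedded_log_class, and spa_normal_class, in that order.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum alloc_class { SLOG, EMBEDDED_SLOG, NORMAL, NCLASSES };

/* Pretend only the normal class currently has a large enough free segment. */
static bool
try_alloc_from(enum alloc_class c, size_t size)
{
	return (c == NORMAL && size > 0);
}

/* Allocate a log (ZIL) block, trying each class in order. */
static int
alloc_zil_block(size_t size)
{
	static const char *names[NCLASSES] =
	    { "slog", "embedded slog", "normal" };

	for (int c = SLOG; c < NCLASSES; c++) {
		if (try_alloc_from(c, size)) {
			(void) printf("allocated %zu bytes from the %s class\n",
			    size, names[c]);
			return (0);
		}
	}
	return (-1);	/* a real caller would fall back to txg_wait_synced() */
}

int
main(void)
{
	return (alloc_zil_block(128 * 1024) == 0 ? 0 : 1);
}

In the sketch only the final tier succeeds, mirroring a pool with no slog
whose embedded slog metaslabs are full; presumably only when all three tiers
fail does the ZIL resort to the txg_wait_synced() path described in
problem #3.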
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index e45bff269..77d7441e8 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -5844,6 +5844,7 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 		 * metaslabs.  We want to set them up for
 		 * zio_claim().
 		 */
+		vdev_metaslab_group_create(vd);
 		VERIFY0(vdev_metaslab_init(vd, 0));
 
 		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -5882,6 +5883,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 		 */
 		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 		spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+		spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
 
 		zcb->zcb_vd_obsolete_counts =
 		    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
@@ -6015,7 +6017,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
-		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
 
 		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
 			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
@@ -6023,7 +6024,9 @@
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
-			ASSERT3P(mg, ==, msp->ms_group);
+			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
+			    spa_embedded_log_class(spa)) ?
+			    vd->vdev_log_mg : vd->vdev_mg);
 
 			/*
 			 * ms_allocatable has been overloaded
@@ -6230,6 +6233,8 @@ dump_block_stats(spa_t *spa)
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
 	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+	zcb.zcb_totalasize +=
+	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
@@ -6277,6 +6282,7 @@ dump_block_stats(spa_t *spa)
 
 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
+	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
@@ -6344,6 +6350,17 @@ dump_block_stats(spa_t *spa)
 		    100.0 * alloc / space);
 	}
 
+	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_embedded_log_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_embedded_log_class(spa));
+
+		(void) printf("\t%-16s %14llu used: %5.2f%%\n",
+		    "Embedded log class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb.zcb_embedded_blocks[i] == 0)
 			continue;
-- 
cgit v1.2.3
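The zdb portion of the patch folds the new class into the existing
accounting: spa_embedded_log_class is added to zcb_totalasize and
total_alloc, and dump_block_stats() prints an "Embedded log class" line
whenever the class has an allocator rotor.  The stand-alone sketch below
shows the same used = 100.0 * alloc / space reporting; the class_stat_t
type, the cs_active field, and the sample values are hypothetical, used
only to make the computation runnable.

/*
 * Illustration only, not zdb itself: class_stat_t and the sample values
 * are hypothetical.  cs_active stands in for the
 * mc_allocator[0].mca_rotor != NULL check used in dump_block_stats().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct class_stat {
	const char *cs_name;
	bool cs_active;		/* class contains metaslabs */
	uint64_t cs_alloc;	/* bytes allocated from the class */
	uint64_t cs_space;	/* total bytes in the class */
} class_stat_t;

int
main(void)
{
	class_stat_t classes[] = {
		{ "Normal class", true, 2400ULL << 30, 4000ULL << 30 },
		{ "Embedded log class", true, 1ULL << 30, 20ULL << 30 },
	};
	uint64_t total_alloc = 0;

	for (size_t i = 0; i < sizeof (classes) / sizeof (classes[0]); i++) {
		const class_stat_t *cs = &classes[i];

		total_alloc += cs->cs_alloc;	/* like total_alloc in zdb */
		if (!cs->cs_active || cs->cs_space == 0)
			continue;
		(void) printf("\t%-18s %14llu used: %5.2f%%\n",
		    cs->cs_name, (unsigned long long)cs->cs_alloc,
		    100.0 * cs->cs_alloc / cs->cs_space);
	}
	(void) printf("\ttotal allocated: %llu\n",
	    (unsigned long long)total_alloc);
	return (0);
}

With these (arbitrary) sample numbers the embedded log line reports 5.00%
used, i.e. 1 GiB allocated out of a 20 GiB class, while the normal class
reports 60% used.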