aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys
diff options
context:
space:
mode:
authorMatthew Ahrens <[email protected]>2021-01-21 15:12:54 -0800
committerGitHub <[email protected]>2021-01-21 15:12:54 -0800
commitaa755b35493a2d27dbbc59f1527c6c93788b28f6 (patch)
treea0be60ea189e4b89772e57a7628b6a2dea4bdcce /include/sys
parent984362a71ebef106d05a8f68f241c5c610ff6547 (diff)
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems: 1. The ZIL allocations are allocated, written to disk, and then a few seconds later freed. This leaves behind holes (free segments) where the ZIL blocks used to be, which increases fragmentation, which negatively impacts performance. 2. When under moderate load, ZIL allocations are of 128KB. If the pool is fairly fragmented, there may not be many free chunks of that size. This causes ZFS to load more metaslabs to locate free segments of 128KB or more. The loading happens synchronously (from zil_commit()), and can take around a second even if the metaslab's spacemap is cached in the ARC. All concurrent synchronous operations on this filesystem must wait while the metaslab is loading. This can cause a significant performance impact. 3. If the pool is very fragmented, there may be zero free chunks of 128KB or more. In this case, the ZIL falls back to txg_wait_synced(), which has an enormous performance impact. These problems can be eliminated by using a dedicated log device ("slog"), even one with the same performance characteristics as the normal devices. This change sets aside one metaslab from each top-level vdev that is preferentially used for ZIL allocations (vdev_log_mg, spa_embedded_log_class). From an allocation perspective, this is similar to having a dedicated log device, and it eliminates the above-mentioned performance problems. Log (ZIL) blocks can be allocated from the following locations. Each one is tried in order until the allocation succeeds: 1. dedicated log vdevs, aka "slog" (spa_log_class) 2. embedded slog metaslabs (spa_embedded_log_class) 3. other metaslabs in normal vdevs (spa_normal_class) The space required for the embedded slog metaslabs is usually between 0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop" space that is not available for user data. On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity, and recordsize=8k, testing shows a ~50% performance increase on random 8k sync writes. On even more fragmented systems (which hit problem #3 above and call txg_wait_synced()), the performance improvement can be arbitrarily large (>100x). Reviewed-by: Serapheim Dimitropoulos <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Don Brady <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #11389
Diffstat (limited to 'include/sys')
-rw-r--r--include/sys/dmu.h3
-rw-r--r--include/sys/spa.h1
-rw-r--r--include/sys/spa_impl.h1
-rw-r--r--include/sys/vdev.h4
-rw-r--r--include/sys/vdev_impl.h2
-rw-r--r--include/sys/zfs_debug.h1
6 files changed, 9 insertions, 3 deletions
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index ad96b729f..10e29a45c 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -142,9 +142,6 @@ typedef enum dmu_object_byteswap {
#define DMU_OT_IS_DDT(ot) \
((ot) == DMU_OT_DDT_ZAP)
-#define DMU_OT_IS_ZIL(ot) \
- ((ot) == DMU_OT_INTENT_LOG)
-
/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
#define DMU_OT_IS_FILE(ot) \
((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 045431c20..0762ae8a3 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1047,6 +1047,7 @@ extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
extern metaslab_class_t *spa_special_class(spa_t *spa);
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index a3afaef38..7f15fd030 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -226,6 +226,7 @@ struct spa {
boolean_t spa_is_exporting; /* true while exporting pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
metaslab_class_t *spa_special_class; /* special allocation class */
metaslab_class_t *spa_dedup_class; /* dedup allocation class */
uint64_t spa_first_txg; /* first txg after spa_open() */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7bc72a03d..d1ef6b5b5 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -33,6 +33,7 @@
#include <sys/zio.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
+#include <sys/metaslab.h>
#include <sys/fs/zfs.h>
#ifdef __cplusplus
@@ -113,6 +114,9 @@ extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
vdev_xlate_func_t *func, void *arg);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
+
+extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);
+
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index fc169842a..1239451bf 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -280,6 +280,7 @@ struct vdev {
uint64_t vdev_ms_shift; /* metaslab size shift */
uint64_t vdev_ms_count; /* number of metaslabs */
metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
@@ -636,6 +637,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
* Other miscellaneous functions
*/
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
+void vdev_metaslab_group_create(vdev_t *vd);
/*
* Vdev ashift optimization tunables
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 89587876f..8b9629fb5 100644
--- a/include/sys/zfs_debug.h
+++ b/include/sys/zfs_debug.h
@@ -56,6 +56,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
#define ZFS_DEBUG_TRIM (1 << 11)
#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12)
+#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13)
extern void __set_error(const char *file, const char *func, int line, int err);
extern void __zfs_dbgmsg(char *buf);