-rw-r--r--  cmd/zdb/zdb.c                        21
-rw-r--r--  include/sys/dmu.h                     3
-rw-r--r--  include/sys/spa.h                     1
-rw-r--r--  include/sys/spa_impl.h                1
-rw-r--r--  include/sys/vdev.h                    4
-rw-r--r--  include/sys/vdev_impl.h               2
-rw-r--r--  include/sys/zfs_debug.h               1
-rw-r--r--  man/man5/zfs-module-parameters.5     16
-rw-r--r--  module/zfs/metaslab.c                94
-rw-r--r--  module/zfs/spa.c                     28
-rw-r--r--  module/zfs/spa_misc.c                67
-rw-r--r--  module/zfs/vdev.c                   144
-rw-r--r--  module/zfs/vdev_removal.c            13
-rw-r--r--  module/zfs/zio.c                     67
14 files changed, 358 insertions, 104 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index e45bff269..77d7441e8 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -5844,6 +5844,7 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
* metaslabs. We want to set them up for
* zio_claim().
*/
+ vdev_metaslab_group_create(vd);
VERIFY0(vdev_metaslab_init(vd, 0));
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -5882,6 +5883,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
@@ -6015,7 +6017,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
- metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
@@ -6023,7 +6024,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(mg, ==, msp->ms_group);
+ ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
+ spa_embedded_log_class(spa)) ?
+ vd->vdev_log_mg : vd->vdev_mg);
/*
* ms_allocatable has been overloaded
@@ -6230,6 +6233,8 @@ dump_block_stats(spa_t *spa)
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+ zcb.zcb_totalasize +=
+ metaslab_class_get_alloc(spa_embedded_log_class(spa));
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
@@ -6277,6 +6282,7 @@ dump_block_stats(spa_t *spa)
total_alloc = norm_alloc +
metaslab_class_get_alloc(spa_log_class(spa)) +
+ metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
@@ -6344,6 +6350,17 @@ dump_block_stats(spa_t *spa)
100.0 * alloc / space);
}
+ if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
+ uint64_t alloc = metaslab_class_get_alloc(
+ spa_embedded_log_class(spa));
+ uint64_t space = metaslab_class_get_space(
+ spa_embedded_log_class(spa));
+
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n",
+ "Embedded log class", (u_longlong_t)alloc,
+ 100.0 * alloc / space);
+ }
+
for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index ad96b729f..10e29a45c 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -142,9 +142,6 @@ typedef enum dmu_object_byteswap {
#define DMU_OT_IS_DDT(ot) \
((ot) == DMU_OT_DDT_ZAP)
-#define DMU_OT_IS_ZIL(ot) \
- ((ot) == DMU_OT_INTENT_LOG)
-
/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
#define DMU_OT_IS_FILE(ot) \
((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 045431c20..0762ae8a3 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1047,6 +1047,7 @@ extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
extern metaslab_class_t *spa_special_class(spa_t *spa);
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index a3afaef38..7f15fd030 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -226,6 +226,7 @@ struct spa {
boolean_t spa_is_exporting; /* true while exporting pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
metaslab_class_t *spa_special_class; /* special allocation class */
metaslab_class_t *spa_dedup_class; /* dedup allocation class */
uint64_t spa_first_txg; /* first txg after spa_open() */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7bc72a03d..d1ef6b5b5 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -33,6 +33,7 @@
#include <sys/zio.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
+#include <sys/metaslab.h>
#include <sys/fs/zfs.h>
#ifdef __cplusplus
@@ -113,6 +114,9 @@ extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
vdev_xlate_func_t *func, void *arg);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
+
+extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);
+
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index fc169842a..1239451bf 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -280,6 +280,7 @@ struct vdev {
uint64_t vdev_ms_shift; /* metaslab size shift */
uint64_t vdev_ms_count; /* number of metaslabs */
metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
@@ -636,6 +637,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
* Other miscellaneous functions
*/
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
+void vdev_metaslab_group_create(vdev_t *vd);
/*
* Vdev ashift optimization tunables
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 89587876f..8b9629fb5 100644
--- a/include/sys/zfs_debug.h
+++ b/include/sys/zfs_debug.h
@@ -56,6 +56,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
#define ZFS_DEBUG_TRIM (1 << 11)
#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12)
+#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13)
extern void __set_error(const char *file, const char *func, int line, int err);
extern void __zfs_dbgmsg(char *buf);
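Usage note on the new ZFS_DEBUG_METASLAB_ALLOC flag (the runtime paths below are standard OpenZFS on Linux, not introduced by this patch): since zfs_flags is a module parameter, the extra allocation-failure tracing can be enabled at runtime by OR-ing 0x2000 (1 << 13) into /sys/module/zfs/parameters/zfs_flags; the resulting zfs_dbgmsg() output can then be read from /proc/spl/kstat/zfs/dbgmsg.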
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 41e8ffa79..8fec44dd3 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -3939,6 +3939,22 @@ Default value: \fB786,432\fR.
.sp
.ne 2
.na
+\fBzfs_embedded_slog_min_ms\fR (int)
+.ad
+.RS 12n
+Usually, one metaslab from each (normal-class) vdev is dedicated for use by
+the ZIL (to log synchronous writes).
+However, if there are fewer than zfs_embedded_slog_min_ms metaslabs in the
+vdev, this functionality is disabled.
+This ensures that we don't set aside an unreasonable amount of space for the
+ZIL.
+.sp
+Default value: \fB64\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzio_deadman_log_all\fR (int)
.ad
.RS 12n
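In other words, enabling the embedded slog costs exactly one metaslab per qualifying vdev. A standalone sketch of the resulting space fraction (illustrative only; embedded_slog_fraction is not a real function, but the gate matches the vdev_metaslab_init() check later in this diff):

#include <stdint.h>

/*
 * Fraction of a top-level vdev set aside for the embedded slog.
 * Enabled only when ms_count > zfs_embedded_slog_min_ms, mirroring
 * the check in vdev_metaslab_init().
 */
static double
embedded_slog_fraction(uint64_t ms_count, uint64_t min_ms)
{
	if (ms_count <= min_ms)
		return (0.0);			/* embedded slog disabled */
	return (1.0 / (double)ms_count);	/* exactly one metaslab */
}

With the defaults (zfs_embedded_slog_min_ms = 64, and about 200 metaslabs per top-level vdev per zfs_vdev_default_ms_count), this works out to roughly 0.5% of the vdev; at the 65-metaslab minimum it peaks at about 1.5%.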
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index bed6bf64c..fdc6922b0 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -524,7 +524,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
+ metaslab_group_t *mg = vdev_get_mg(tvd, mc);
/*
* Skip any holes, uninitialized top-levels, or
@@ -535,12 +535,16 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
continue;
}
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
}
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+ }
kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
@@ -1004,16 +1008,22 @@ metaslab_group_initialized(metaslab_group_t *mg)
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
- return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+ /*
+ * Note that the number of nodes in mg_metaslab_tree may be one less
+ * than vdev_ms_count, due to the embedded log metaslab.
+ */
+ mutex_enter(&mg->mg_lock);
+ uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
+ mutex_exit(&mg->mg_lock);
+ return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
}
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
uint64_t *mg_hist;
- vdev_t *vd = mg->mg_vd;
- uint64_t ashift = vd->vdev_ashift;
- int i;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
@@ -1024,21 +1034,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
SPACE_MAP_HISTOGRAM_SIZE + ashift);
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- /* skip if not active or not a member */
- if (msp->ms_sm == NULL || msp->ms_group != mg)
+ mutex_enter(&mg->mg_lock);
+ for (metaslab_t *msp = avl_first(t);
+ msp != NULL; msp = AVL_NEXT(t, msp)) {
+ VERIFY3P(msp->ms_group, ==, mg);
+ /* skip if not active */
+ if (msp->ms_sm == NULL)
continue;
- for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
mg_hist[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
+ }
}
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+ mutex_exit(&mg->mg_lock);
+
kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
@@ -1054,6 +1068,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
@@ -1078,6 +1094,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
msp->ms_sm->sm_phys->smp_histogram[i]);
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
@@ -2741,37 +2759,47 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
- metaslab_space_update(vd, mg->mg_class,
- -metaslab_allocated_space(msp), 0, -msp->ms_size);
+ /*
+ * If the range trees haven't been allocated, this metaslab hasn't
+ * been through metaslab_sync_done() for the first time yet, so its
+ * space hasn't been accounted for in its vdev and doesn't need to be
+ * subtracted.
+ */
+ if (msp->ms_freed != NULL) {
+ metaslab_space_update(vd, mg->mg_class,
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
+ }
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
metaslab_unload(msp);
+
range_tree_destroy(msp->ms_allocatable);
- range_tree_destroy(msp->ms_freeing);
- range_tree_destroy(msp->ms_freed);
- ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
- metaslab_unflushed_changes_memused(msp));
- spa->spa_unflushed_stats.sus_memused -=
- metaslab_unflushed_changes_memused(msp);
- range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
- range_tree_destroy(msp->ms_unflushed_allocs);
- range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
- range_tree_destroy(msp->ms_unflushed_frees);
+ if (msp->ms_freed != NULL) {
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
- for (int t = 0; t < TXG_SIZE; t++) {
- range_tree_destroy(msp->ms_allocating[t]);
- }
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_allocs);
+ range_tree_destroy(msp->ms_checkpointing);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_frees);
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_destroy(msp->ms_defer[t]);
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
+ }
}
ASSERT0(msp->ms_deferspace);
- range_tree_destroy(msp->ms_checkpointing);
-
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
@@ -5113,7 +5141,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* all else fails.
*/
if (vd != NULL && vd->vdev_mg != NULL) {
- mg = vd->vdev_mg;
+ mg = vdev_get_mg(vd, mc);
if (flags & METASLAB_HINTBP_AVOID &&
mg->mg_next != NULL)
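Because a normal-class vdev can now park one metaslab in vdev_log_mg, metaslab_group_get_space() above sizes a group from its own mg_metaslab_tree rather than from vdev_ms_count. A hedged sketch of the resulting invariant (illustrative assertion, not part of the patch; assumes a top-level vdev after vdev_metaslab_init() has run):

/*
 * Illustrative only: the two groups on a vdev together still account
 * for every one of its vdev_ms_count metaslabs, whether or not one of
 * them was donated to the embedded log group.
 */
ASSERT3U(metaslab_group_get_space(vd->vdev_mg) +
    metaslab_group_get_space(vd->vdev_log_mg), ==,
    (uint64_t)vd->vdev_ms_count << vd->vdev_ms_shift);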
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 53ffbc31c..57a492993 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -303,10 +303,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
alloc = metaslab_class_get_alloc(mc);
alloc += metaslab_class_get_alloc(spa_special_class(spa));
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+ alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
size = metaslab_class_get_space(mc);
size += metaslab_class_get_space(spa_special_class(spa));
size += metaslab_class_get_space(spa_dedup_class(spa));
+ size += metaslab_class_get_space(spa_embedded_log_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1196,6 +1198,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_embedded_log_class =
+ metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
@@ -1347,6 +1351,9 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
+ metaslab_class_destroy(spa->spa_embedded_log_class);
+ spa->spa_embedded_log_class = NULL;
+
metaslab_class_destroy(spa->spa_special_class);
spa->spa_special_class = NULL;
@@ -2103,6 +2110,9 @@ spa_check_logs(spa_t *spa)
return (rv);
}
+/*
+ * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
static boolean_t
spa_passivate_log(spa_t *spa)
{
@@ -2113,10 +2123,10 @@ spa_passivate_log(spa_t *spa)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
if (tvd->vdev_islog) {
- metaslab_group_passivate(mg);
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_passivate(tvd->vdev_mg);
slog_found = B_TRUE;
}
}
@@ -2124,6 +2134,9 @@ spa_passivate_log(spa_t *spa)
return (slog_found);
}
+/*
+ * Activate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
static void
spa_activate_log(spa_t *spa)
{
@@ -2133,10 +2146,11 @@ spa_activate_log(spa_t *spa)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- if (tvd->vdev_islog)
- metaslab_group_activate(mg);
+ if (tvd->vdev_islog) {
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_activate(tvd->vdev_mg);
+ }
}
}
@@ -8033,12 +8047,16 @@ spa_async_thread(void *arg)
old_space = metaslab_class_get_space(spa_normal_class(spa));
old_space += metaslab_class_get_space(spa_special_class(spa));
old_space += metaslab_class_get_space(spa_dedup_class(spa));
+ old_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
new_space = metaslab_class_get_space(spa_normal_class(spa));
new_space += metaslab_class_get_space(spa_special_class(spa));
new_space += metaslab_class_get_space(spa_dedup_class(spa));
+ new_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
mutex_exit(&spa_namespace_lock);
/*
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f49be8eec..b4c73f58d 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -349,9 +349,11 @@ int spa_asize_inflation = 24;
* Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
* the pool to be consumed. This ensures that we don't run the pool
* completely out of space, due to unaccounted changes (e.g. to the MOS).
- * It also limits the worst-case time to allocate space. If we have
- * less than this amount of free space, most ZPL operations (e.g. write,
- * create) will return ENOSPC.
+ * It also limits the worst-case time to allocate space. If we have less than
+ * this amount of free space, most ZPL operations (e.g. write, create) will
+ * return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are also part of
+ * this 3.2% of space which can't be consumed by normal writes; the slop space
+ * "proper" (spa_get_slop_space()) is decreased by the embedded log space.
*
* Certain operations (e.g. file removal, most administrative actions) can
* use half the slop space. They will only return ENOSPC if less than half
@@ -1026,10 +1028,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
/*
* Spares are tracked globally due to the following constraints:
*
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
* another pool.
- * - A spare in use in any pool can only be the source of a replacement if
+ * - A spare in use in any pool can only be the source of a replacement if
* the target is a spare in the same pool.
*
* We keep track of all spares on the system through the use of a reference
@@ -1236,6 +1238,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
@@ -1776,17 +1779,37 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
}
/*
- * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
- * or at least 128MB, unless that would cause it to be more than half the
- * pool size.
- *
- * See the comment above spa_slop_shift for details.
+ * Return the amount of slop space in bytes. It is typically 1/32 of the pool
+ * (3.2%), minus the embedded log space. On very small pools, it may be
+ * slightly larger than this. The embedded log space is not included in
+ * spa_dspace. By subtracting it, the usable space (per "zfs list") is a
+ * constant 97% of the total space, regardless of metaslab size (assuming the
+ * default spa_slop_shift=5 and a non-tiny pool).
+ *
+ * See the comment above spa_slop_shift for more details.
*/
uint64_t
spa_get_slop_space(spa_t *spa)
{
uint64_t space = spa_get_dspace(spa);
- return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+ uint64_t slop = space >> spa_slop_shift;
+
+ /*
+ * Subtract the embedded log space, but no more than half the (3.2%)
+ * unusable space. Note, the "no more than half" is only relevant if
+ * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by
+ * default.
+ */
+ uint64_t embedded_log =
+ metaslab_class_get_dspace(spa_embedded_log_class(spa));
+ slop -= MIN(embedded_log, slop >> 1);
+
+ /*
+ * Slop space should be at least spa_min_slop, but no more than half
+ * the entire pool.
+ */
+ slop = MAX(slop, MIN(space >> 1, spa_min_slop));
+ return (slop);
}
uint64_t
@@ -1873,6 +1896,12 @@ spa_log_class(spa_t *spa)
}
metaslab_class_t *
+spa_embedded_log_class(spa_t *spa)
+{
+ return (spa->spa_embedded_log_class);
+}
+
+metaslab_class_t *
spa_special_class(spa_t *spa)
{
return (spa->spa_special_class);
@@ -1891,12 +1920,10 @@ metaslab_class_t *
spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
uint_t level, uint_t special_smallblk)
{
- if (DMU_OT_IS_ZIL(objtype)) {
- if (spa->spa_log_class->mc_groups != 0)
- return (spa_log_class(spa));
- else
- return (spa_normal_class(spa));
- }
+ /*
+ * ZIL allocations determine their class in zio_alloc_zil().
+ */
+ ASSERT(objtype != DMU_OT_INTENT_LOG);
boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
@@ -2432,9 +2459,9 @@ spa_fini(void)
}
/*
- * Return whether this pool has slogs. No locking needed.
+ * Return whether this pool has a dedicated slog device. No locking needed.
* It's not a problem if the wrong answer is returned as it's only for
- * performance and not correctness
+ * performance and not correctness.
*/
boolean_t
spa_has_slogs(spa_t *spa)
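For reference, a standalone sketch of the new spa_get_slop_space() arithmetic above, using the defaults of this era (spa_slop_shift = 5, spa_min_slop = 128MB); slop_sketch and the example numbers are illustrative, not pool code:

#include <stdint.h>
#include <stdio.h>

static uint64_t
slop_sketch(uint64_t dspace, uint64_t embedded_log)
{
	uint64_t slop = dspace >> 5;		/* 1/32 of the pool, 3.2% */
	uint64_t min_slop = 128ULL << 20;	/* 128MB floor */
	uint64_t cap = slop >> 1;

	/* Subtract the embedded log space, but no more than half the slop. */
	slop -= (embedded_log < cap) ? embedded_log : cap;

	/* At least min_slop, but never more than half the pool. */
	uint64_t lo = ((dspace >> 1) < min_slop) ? (dspace >> 1) : min_slop;
	return ((slop > lo) ? slop : lo);
}

int
main(void)
{
	/*
	 * Hypothetical 1TiB pool with 16GiB in the embedded log class:
	 * 32GiB raw slop - 16GiB embedded log = 16GiB of slop.
	 */
	printf("%llu\n", (unsigned long long)
	    slop_sketch(1ULL << 40, 16ULL << 30));
	return (0);
}

Because the embedded log space is already excluded from spa_dspace, subtracting it here keeps the usable space reported by "zfs list" at a steady ~97% of the pool, as the comment above notes.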
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 7ffe92421..f305da6f5 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -59,6 +59,27 @@
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL. These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class. The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added). See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations. Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev. This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL. If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
+
/* default target for number of metaslabs per top-level vdev */
int zfs_vdev_default_ms_count = 200;
@@ -223,6 +244,22 @@ vdev_getops(const char *type)
return (ops);
}
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. A vdev's metaslabs may belong to two different classes:
+ * dedicated slog devices use only the primary metaslab group, while normal
+ * vdevs may also place one metaslab in a separate log group, in which case
+ * vdev_log_mg is non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+ if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+ vd->vdev_log_mg != NULL)
+ return (vd->vdev_log_mg);
+ else
+ return (vd->vdev_mg);
+}
+
/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
@@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd)
metaslab_group_destroy(vd->vdev_mg);
vd->vdev_mg = NULL;
}
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
ASSERT0(vd->vdev_stat.vs_space);
ASSERT0(vd->vdev_stat.vs_dspace);
@@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
if (tvd->vdev_mg)
ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+ if (tvd->vdev_log_mg)
+ ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_log_mg = svd->vdev_log_mg;
tvd->vdev_ms = svd->vdev_ms;
svd->vdev_mg = NULL;
+ svd->vdev_log_mg = NULL;
svd->vdev_ms = NULL;
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
+ if (tvd->vdev_log_mg != NULL)
+ tvd->vdev_log_mg->mg_vd = tvd;
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
@@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
-static void
+void
vdev_metaslab_group_create(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
@@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd)
vd->vdev_mg = metaslab_group_create(mc, vd,
spa->spa_alloc_count);
+ if (!vd->vdev_islog) {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_embedded_log_class(spa), vd, 1);
+ }
+
/*
* The spa ashift min/max only apply for the normal metaslab
* class. Class destination is late binding so ashift boundary
@@ -1340,8 +1393,6 @@ int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
@@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
- for (m = oldc; m < newc; m++) {
- uint64_t object = 0;
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t object = 0;
/*
* vdev_ms_array may be 0 if we are creating the "fake"
* metaslabs for an indirect vdev for zdb's leak detection.
* See zdb_leak_init().
*/
if (txg == 0 && vd->vdev_ms_array != 0) {
- error = dmu_read(mos, vd->vdev_ms_array,
+ error = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array,
m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
if (error != 0) {
@@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}
-#ifndef _KERNEL
- /*
- * To accommodate zdb_leak_init() fake indirect
- * metaslabs, we allocate a metaslab group for
- * indirect vdevs which normally don't have one.
- */
- if (vd->vdev_mg == NULL) {
- ASSERT0(vdev_is_concrete(vd));
- vdev_metaslab_group_create(vd);
- }
-#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
@@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}
+ /*
+ * Find the emptiest metaslab on the vdev and mark it for use for
+ * embedded slog by moving it from the regular to the log metaslab
+ * group.
+ */
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+ vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+ avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+ uint64_t slog_msid = 0;
+ uint64_t smallest = UINT64_MAX;
+
+ /*
+ * Note, we only search the new metaslabs, because the old
+ * (pre-existing) ones may be active (e.g. have non-empty
+ * range_tree's), and we don't move them to the new
+ * metaslab_t.
+ */
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t alloc =
+ space_map_allocated(vd->vdev_ms[m]->ms_sm);
+ if (alloc < smallest) {
+ slog_msid = m;
+ smallest = alloc;
+ }
+ }
+ metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+ /*
+ * The metaslab was marked as dirty at the end of
+ * metaslab_init(). Remove it from the dirty list so that we
+ * can uninitialize and reinitialize it to the new class.
+ */
+ if (txg != 0) {
+ (void) txg_list_remove_this(&vd->vdev_ms_list,
+ slog_ms, txg);
+ }
+ uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+ metaslab_fini(slog_ms);
+ VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+ &vd->vdev_ms[slog_msid]));
+ }
+
if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
@@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
*/
if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_group_activate(vd->vdev_log_mg);
}
if (txg == 0)
@@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd)
if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
+
metaslab_group_passivate(mg);
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+ }
uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
@@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd)
}
vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
vd->vdev_ms = NULL;
-
vd->vdev_ms_count = 0;
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
ASSERT0(mg->mg_histogram[i]);
+ if (vd->vdev_log_mg != NULL)
+ ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+ }
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3531,8 +3622,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
!= NULL)
metaslab_sync_done(msp, txg);
- if (reassess)
+ if (reassess) {
metaslab_sync_reassess(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_sync_reassess(vd->vdev_log_mg);
+ }
}
void
@@ -3856,6 +3950,7 @@ top:
/*
* Prevent any future allocations.
*/
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
metaslab_group_passivate(mg);
(void) spa_vdev_state_exit(spa, vd, 0);
@@ -4256,6 +4351,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
*/
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
+ /*
+ * The vdev fragmentation rating doesn't take into
+ * account the embedded slog metaslab (vdev_log_mg).
+ * Since it's only one metaslab, it would have a tiny
+ * impact on the overall fragmentation.
+ */
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
@@ -5234,6 +5335,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
"Disable cache flushes");
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ "Minimum number of metaslabs required to dedicate one for log blocks");
+
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
"Minimum ashift used when creating new top-level vdevs");
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 6eaaddd39..a758fe4fb 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1207,6 +1207,11 @@ vdev_remove_complete(spa_t *spa)
vd->vdev_mg = NULL;
spa_log_sm_set_blocklimit(spa);
}
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
ASSERT0(vd->vdev_stat.vs_space);
ASSERT0(vd->vdev_stat.vs_dspace);
@@ -1780,6 +1785,8 @@ spa_vdev_remove_cancel_impl(spa_t *spa)
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
vdev_t *vd = vdev_lookup_top(spa, vdid);
metaslab_group_activate(vd->vdev_mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
}
@@ -1858,6 +1865,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
@@ -1893,6 +1901,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
return (error);
}
ASSERT0(vd->vdev_stat.vs_alloc);
@@ -2121,6 +2130,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
*/
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_passivate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
/*
* Wait for the youngest allocations and frees to sync,
@@ -2157,6 +2168,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 41b347b76..538a2a2cd 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2750,10 +2750,9 @@ zio_write_gang_done(zio_t *zio)
}
static zio_t *
-zio_write_gang_block(zio_t *pio)
+zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
spa_t *spa = pio->io_spa;
- metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@@ -3470,6 +3469,17 @@ zio_dva_allocate(zio_t *zio)
zio->io_metaslab_class = mc;
}
+ /*
+ * Try allocating the block in the usual metaslab class.
+ * If that's full, allocate it in the normal class.
+ * If that's full, allocate as a gang block,
+ * and if all are full, the allocation fails (which shouldn't happen).
+ *
+ * Note that we do not fall back on embedded slog (ZIL) space, to
+ * preserve unfragmented slog space, which is critical for decent
+ * sync write performance. If a log allocation fails, we will fall
+ * back to spa_sync() which is abysmal for performance.
+ */
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
@@ -3489,26 +3499,38 @@ zio_dva_allocate(zio_t *zio)
zio->io_prop.zp_copies, zio->io_allocator, zio);
zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
- mc = spa_normal_class(spa);
- VERIFY(metaslab_class_throttle_reserve(mc,
+ VERIFY(metaslab_class_throttle_reserve(
+ spa_normal_class(spa),
zio->io_prop.zp_copies, zio->io_allocator, zio,
flags | METASLAB_MUST_RESERVE));
- } else {
- mc = spa_normal_class(spa);
}
- zio->io_metaslab_class = mc;
+ zio->io_metaslab_class = mc = spa_normal_class(spa);
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying normal class: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
}
+ if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying ganging: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+ return (zio_write_gang_block(zio, mc));
+ }
if (error != 0) {
- zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
- "size %llu, error %d", spa_name(spa), zio, zio->io_size,
- error);
- if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
- return (zio_write_gang_block(zio));
+ if (error != ENOSPC ||
+ (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
+ "size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
zio->io_error = error;
}
@@ -3588,15 +3610,18 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
spa->spa_alloc_count;
- error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp,
- 1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
- if (error == 0) {
- *slog = TRUE;
- } else {
- error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp,
- 1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
- if (error == 0)
- *slog = FALSE;
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+ txg, NULL, flags, &io_alloc_list, NULL, allocator);
+ *slog = (error == 0);
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
+ }
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
}
metaslab_trace_fini(&io_alloc_list);
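To summarize the new behavior of zio_alloc_zil(), a standalone sketch of its three-tier fallback (zil_alloc_sketch and the loop are illustrative; the real code above simply calls metaslab_alloc() for each class in turn):

/*
 * Illustrative only: log blocks try the dedicated slog first, then the
 * embedded slog metaslabs, then the rest of the normal class.
 */
static int
zil_alloc_sketch(spa_t *spa, uint64_t size, blkptr_t *new_bp, uint64_t txg,
    int flags, zio_alloc_list_t *al, int allocator, boolean_t *slog)
{
	metaslab_class_t *order[3] = {
		spa_log_class(spa),		/* 1. dedicated log vdevs */
		spa_embedded_log_class(spa),	/* 2. embedded slog metaslabs */
		spa_normal_class(spa),		/* 3. other normal metaslabs */
	};
	int error = 0;

	for (int i = 0; i < 3; i++) {
		error = metaslab_alloc(spa, order[i], size, new_bp, 1,
		    txg, NULL, flags, al, NULL, allocator);
		if (i == 0)
			*slog = (error == 0);
		if (error == 0)
			break;
	}
	return (error);
}

Note the deliberate asymmetry with zio_dva_allocate(): normal writes never fall back into the embedded slog class, which keeps that space unfragmented for ZIL allocations and decent sync write performance.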