-rw-r--r--  include/sys/arc.h                  |   1
-rw-r--r--  include/sys/metaslab.h             |   6
-rw-r--r--  include/sys/metaslab_impl.h        |  12
-rw-r--r--  man/man5/zfs-module-parameters.5   |  15
-rw-r--r--  module/zfs/arc.c                   |   3
-rw-r--r--  module/zfs/metaslab.c              | 274
-rw-r--r--  module/zfs/spa.c                   |   4
-rw-r--r--  module/zfs/spa_log_spacemap.c      |   1
-rw-r--r--  module/zfs/vdev.c                  |  14
-rw-r--r--  module/zfs/vdev_initialize.c       |   7
-rw-r--r--  module/zfs/vdev_trim.c             |  10
11 files changed, 289 insertions, 58 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h
index dc2fd0364..59c0bea92 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -291,6 +291,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+uint64_t arc_all_memory(void);
uint64_t arc_target_bytes(void);
void arc_init(void);
void arc_fini(void);
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 7dd5fe2b5..00b8b4758 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -57,7 +57,6 @@ int metaslab_sort_by_flushed(const void *, const void *);
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
int metaslab_load(metaslab_t *);
-void metaslab_potentially_unload(metaslab_t *, uint64_t);
void metaslab_unload(metaslab_t *);
boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
@@ -110,7 +109,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-
+void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
@@ -133,7 +132,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
void metaslab_disable(metaslab_t *);
-void metaslab_enable(metaslab_t *, boolean_t);
+void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
+void metaslab_set_selected_txg(metaslab_t *, uint64_t);
extern int metaslab_debug_load;
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 08ee8d279..07f07c02d 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -36,6 +36,7 @@
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
+#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@@ -194,6 +195,12 @@ struct metaslab_class {
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ /*
+ * List of all loaded metaslabs in the class, sorted in order of most
+ * recent use.
+ */
+ multilist_t *mc_metaslab_txg_list;
};
/*
@@ -378,6 +385,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;
+ uint64_t ms_allocating_total;
/*
* The following range trees are accessed only from syncing context.
@@ -508,6 +516,10 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+ /*
+ * Node in metaslab class's selected txg list
+ */
+ multilist_node_t ms_class_txg_node;
/*
* Allocs and frees that are committed to the vdev log spacemap but
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index c25f2f046..8a1048bee 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -389,6 +389,21 @@ Default value: \fB3600 seconds\fR (one hour)
.sp
.ne 2
.na
+\fBzfs_metaslab_mem_limit\fR (int)
+.ad
+.RS 12n
+When we are loading a new metaslab, we check the amount of memory being used
+to store metaslab range trees. If it is over a threshold, we attempt to unload
+the least recently used metaslab to prevent the system from clogging all of
+its memory with range trees. This tunable sets the percentage of total system
+memory that is the threshold.
+.sp
+Default value: \fB75 percent\fR
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_default_ms_count\fR (int)
.ad
.RS 12n
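
The check that zfs_metaslab_mem_limit gates is a straight percentage comparison
against total system memory. As a minimal userspace sketch of that arithmetic
(over_metaslab_mem_limit and all of the figures below are made-up stand-ins, not
ZFS code; the real test in metaslab_potentially_evict() further down compares
arc_all_memory() against the range_seg kmem cache counters):

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Nonzero when the memory held by loaded range trees exceeds the
     * configured percentage of total system memory.
     */
    static int
    over_metaslab_mem_limit(uint64_t allmem, uint64_t segs_inuse,
        uint64_t seg_size, int limit_pct)
    {
            return (allmem * limit_pct / 100 < segs_inuse * seg_size);
    }

    int
    main(void)
    {
            uint64_t allmem = 16ULL << 30;           /* pretend 16 GiB of RAM */
            uint64_t segs_inuse = 250 * 1000 * 1000; /* pretend 250M range segs */
            uint64_t seg_size = 64;                  /* pretend 64 bytes each */

            printf("over zfs_metaslab_mem_limit: %d\n",
                over_metaslab_mem_limit(allmem, segs_inuse, seg_size, 75));
            return (0);
    }
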
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 90a731bff..b5fca8e26 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1110,7 +1110,6 @@ static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);
static void arc_prune_async(int64_t);
-static uint64_t arc_all_memory(void);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -4828,7 +4827,7 @@ arc_reduce_target_size(int64_t to_free)
* Return maximum amount of memory that we could possibly use. Reduced
* to half of all memory in user space which is primarily used for testing.
*/
-static uint64_t
+uint64_t
arc_all_memory(void)
{
#ifdef _KERNEL
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 9a9a5e0cf..2f92fffa4 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -278,6 +278,13 @@ int max_disabled_ms = 3;
*/
unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+/*
+ * Maximum percentage of memory to use for storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 75;
+
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -286,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
#endif
@@ -306,6 +315,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+ offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
sizeof (zfs_refcount_t), KM_SLEEP);
mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
@@ -332,6 +343,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
sizeof (uint64_t));
mutex_destroy(&mc->mc_lock);
+ multilist_destroy(mc->mc_metaslab_txg_list);
kmem_free(mc, sizeof (metaslab_class_t));
}
@@ -517,6 +529,47 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
return (space);
}
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+ multilist_t *ml = mc->mc_metaslab_txg_list;
+ for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ /*
+ * Once we've hit a metaslab selected too recently to
+ * evict, we're done evicting for now.
+ */
+ if (msp->ms_selected_txg + metaslab_unload_delay >=
+ txg) {
+ mutex_exit(&msp->ms_lock);
+ break;
+ }
+
+ /*
+ * If the metaslab has been removed from the list
+ * (which could happen if we were at the memory limit
+ * and it was evicted during this loop), then we can't
+ * proceed and we should restart the sublist.
+ */
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ mutex_exit(&msp->ms_lock);
+ i--;
+ break;
+ }
+ mls = multilist_sublist_lock(ml, i);
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ metaslab_evict(msp, txg);
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ }
+ }
+}
+
static int
metaslab_compare(const void *x1, const void *x2)
{
@@ -960,6 +1013,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
+
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
msp->ms_group = NULL;
mutex_exit(&mg->mg_lock);
}
@@ -1519,6 +1580,13 @@ metaslab_flush_wait(metaslab_t *msp)
cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
}
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+ metaslab_t *msp = arg;
+ return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
@@ -1577,6 +1645,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
+ ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+ msp->ms_allocating_total);
ASSERT3U(msp->ms_deferspace, ==,
range_tree_space(msp->ms_defer[0]) +
@@ -1792,6 +1862,86 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
VERIFY3U(msp->ms_weight, ==, weight);
}
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it. We don't
+ * want to spend too much time in this loop to prevent performance
+ * degradation, and we expect that most of the time this operation will
+ * succeed. Between that and the normal unloading processing during txg sync,
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+ uint64_t allmem = arc_all_memory();
+ extern kmem_cache_t *range_seg_cache;
+ uint64_t inuse = range_seg_cache->skc_obj_total;
+ uint64_t size = range_seg_cache->skc_obj_size;
+ int tries = 0;
+ for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+ tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+ tries++) {
+ unsigned int idx = multilist_get_random_index(
+ mc->mc_metaslab_txg_list);
+ multilist_sublist_t *mls =
+ multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+ inuse * size) {
+ VERIFY3P(mls, ==, multilist_sublist_lock(
+ mc->mc_metaslab_txg_list, idx));
+ ASSERT3U(idx, ==,
+ metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ multilist_sublist_unlock(mls);
+ break;
+ }
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ /*
+ * If the metaslab is currently loading there are two
+ * cases. If it's the metaslab we're evicting, we
+ * can't continue on or we'll panic when we attempt to
+ * recursively lock the mutex. If it's another
+ * metaslab that's loading, it can be safely skipped,
+ * since we know it's very new and therefore not a
+ * good eviction candidate. We check later once the
+ * lock is held that the metaslab is fully loaded
+ * before actually unloading it.
+ */
+ if (msp->ms_loading) {
+ msp = next_msp;
+ inuse = range_seg_cache->skc_obj_total;
+ continue;
+ }
+ /*
+ * We can't unload metaslabs with no spacemap because
+ * they're not ready to be unloaded yet. We can't
+ * unload metaslabs with outstanding allocations
+ * because doing so could cause the metaslab's weight
+ * to decrease while it's unloaded, which violates an
+ * invariant that we use to prevent unnecessary
+ * loading. We also don't unload metaslabs that are
+ * currently active because they are high-weight
+ * metaslabs that are likely to be used in the near
+ * future.
+ */
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+ msp->ms_allocating_total == 0) {
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ inuse = range_seg_cache->skc_obj_total;
+ }
+ }
+#endif
+}
+
static int
metaslab_load_impl(metaslab_t *msp)
{
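
Stripped of the multilist and kmem-cache details, metaslab_potentially_evict()
above is a bounded scan: pick a random sublist, evict from it until usage drops
below the limit, and give up after 2x the sublist count in attempts. A rough
standalone model of that shape (struct ms, evict_one() and the byte counts are
invented for illustration, not the kernel data structures):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    #define NUM_SUBLISTS    4

    struct ms {
            uint64_t mem;           /* bytes of range-tree memory held */
            struct ms *next;
    };

    static struct ms *sublists[NUM_SUBLISTS];
    static uint64_t mem_inuse;
    static const uint64_t mem_limit = 1000; /* stands in for allmem * pct / 100 */

    /* Unload the sublist head, returning its range-tree memory. */
    static void
    evict_one(int idx)
    {
            struct ms *m = sublists[idx];

            if (m == NULL)
                    return;
            sublists[idx] = m->next;
            mem_inuse -= m->mem;
            free(m);
    }

    /*
     * Evict until we are under the limit, but cap the effort at twice the
     * number of sublists so a single metaslab load never stalls for long.
     */
    static void
    potentially_evict(void)
    {
            for (int tries = 0; mem_inuse > mem_limit &&
                tries < NUM_SUBLISTS * 2; tries++) {
                    int idx = rand() % NUM_SUBLISTS;

                    while (sublists[idx] != NULL && mem_inuse > mem_limit)
                            evict_one(idx);
            }
    }

    int
    main(void)
    {
            for (int i = 0; i < 20; i++) {
                    struct ms *m = malloc(sizeof (*m));

                    m->mem = 100;
                    m->next = sublists[i % NUM_SUBLISTS];
                    sublists[i % NUM_SUBLISTS] = m;
                    mem_inuse += m->mem;
            }
            potentially_evict();
            printf("in use after eviction: %llu\n",
                (unsigned long long)mem_inuse);
            return (0);
    }
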
@@ -2024,6 +2174,16 @@ metaslab_load(metaslab_t *msp)
*/
ASSERT(!msp->ms_loaded);
+ /*
+ * If we're loading a metaslab in the normal class, consider evicting
+ * another one to keep our memory usage under the limit defined by the
+ * zfs_metaslab_mem_limit tunable.
+ */
+ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+ msp->ms_group->mg_class) {
+ metaslab_potentially_evict(msp->ms_group->mg_class);
+ }
+
int error = metaslab_load_impl(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -2038,7 +2198,13 @@ metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
- metaslab_verify_weight_and_frag(msp);
+ /*
+ * This can happen if a metaslab is selected for eviction (in
+ * metaslab_potentially_evict) and then unloaded during spa_sync (via
+ * metaslab_class_evict_old).
+ */
+ if (!msp->ms_loaded)
+ return;
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
@@ -2047,6 +2213,15 @@ metaslab_unload(metaslab_t *msp)
msp->ms_activation_weight = 0;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+ if (msp->ms_group != NULL) {
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+ }
+
/*
* We explicitly recalculate the metaslab's weight based on its space
* map (as it is now not loaded). We want unloaded metaslabs to always
@@ -2064,6 +2239,20 @@ metaslab_unload(metaslab_t *msp)
}
void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ msp->ms_selected_txg = txg;
+ multilist_sublist_insert_tail(mls, msp);
+ multilist_sublist_unlock(mls);
+}
+
+void
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta)
{
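
Taken together, metaslab_set_selected_txg() and metaslab_class_evict_old()
maintain a per-class LRU keyed on the txg in which a metaslab was last
selected: selection moves it to the tail of the list, and eviction walks from
the head until it reaches something selected too recently. A minimal sketch of
that bookkeeping with simplified stand-in types (struct ms, set_selected_txg()
and evict_old() below are not the kernel multilist API):

    #include <stdio.h>
    #include <stdint.h>

    struct ms {
            uint64_t selected_txg;
            int loaded;
            struct ms *prev, *next;
    };

    static struct ms *head, *tail;          /* oldest at head, newest at tail */
    static uint64_t unload_delay = 32;      /* txgs; stand-in value only */

    static void
    list_remove(struct ms *m)
    {
            if (m->prev != NULL) m->prev->next = m->next; else head = m->next;
            if (m->next != NULL) m->next->prev = m->prev; else tail = m->prev;
            m->prev = m->next = NULL;
    }

    /* Like metaslab_set_selected_txg(): stamp the txg, move to the tail. */
    static void
    set_selected_txg(struct ms *m, uint64_t txg)
    {
            if (m->prev != NULL || m->next != NULL || head == m)
                    list_remove(m);
            m->selected_txg = txg;
            m->prev = tail;
            m->next = NULL;
            if (tail != NULL) tail->next = m; else head = m;
            tail = m;
    }

    /* Like metaslab_class_evict_old(): unload anything selected too long ago. */
    static void
    evict_old(uint64_t txg)
    {
            while (head != NULL && head->selected_txg + unload_delay < txg) {
                    struct ms *m = head;

                    list_remove(m);
                    m->loaded = 0;          /* metaslab_unload() in the real code */
            }
    }

    int
    main(void)
    {
            struct ms a = { 0, 1, NULL, NULL }, b = { 0, 1, NULL, NULL };

            set_selected_txg(&a, 100);      /* a last used in txg 100 */
            set_selected_txg(&b, 140);      /* b used more recently */
            evict_old(150);                 /* only a has aged past the delay */
            printf("a loaded: %d, b loaded: %d\n", a.loaded, b.loaded);
            return (0);
    }
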
@@ -2091,6 +2280,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&ms->ms_class_txg_node);
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
@@ -2703,8 +2893,13 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
* If we're activating for the claim code, we don't want to actually
* set the metaslab up for a specific allocator.
*/
- if (activation_weight == METASLAB_WEIGHT_CLAIM)
+ if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(mg, msp, msp->ms_weight |
+ activation_weight);
return (0);
+ }
metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
mg->mg_primaries : mg->mg_secondaries);
@@ -2719,6 +2914,12 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
ASSERT3S(msp->ms_allocator, ==, -1);
msp->ms_allocator = allocator;
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort_impl(mg, msp,
+ msp->ms_weight | activation_weight);
+
mutex_exit(&mg->mg_lock);
return (0);
@@ -2795,11 +2996,6 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
return (error);
}
- ASSERT0(msp->ms_activation_weight);
- msp->ms_activation_weight = msp->ms_weight;
- metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | activation_weight);
-
ASSERT(msp->ms_loaded);
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
@@ -2894,14 +3090,15 @@ static void
metaslab_preload(void *arg)
{
metaslab_t *msp = arg;
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ spa_t *spa = mc->mc_spa;
fstrans_cookie_t cookie = spl_fstrans_mark();
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock);
(void) metaslab_load(msp);
- msp->ms_selected_txg = spa_syncing_txg(spa);
+ metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
mutex_exit(&msp->ms_lock);
spl_fstrans_unmark(cookie);
}
@@ -3613,28 +3810,21 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_tx_commit(tx);
}
-void
-metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
{
- /*
- * If the metaslab is loaded and we've not tried to load or allocate
- * from it in 'metaslab_unload_delay' txgs, then unload it.
- */
- if (msp->ms_loaded &&
- msp->ms_disabled == 0 &&
- msp->ms_selected_txg + metaslab_unload_delay < txg) {
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- VERIFY0(range_tree_space(
- msp->ms_allocating[(txg + t) & TXG_MASK]));
- }
- if (msp->ms_allocator != -1) {
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- }
+ if (!msp->ms_loaded || msp->ms_disabled != 0)
+ return;
- if (!metaslab_debug_unload)
- metaslab_unload(msp);
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
}
+ if (msp->ms_allocator != -1)
+ metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
}
/*
@@ -3791,7 +3981,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
-
+ msp->ms_allocating_total -= msp->ms_allocated_this_txg;
msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@@ -4072,6 +4262,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+ msp->ms_allocating_total += size;
/* Track the last successful allocation */
msp->ms_alloc_txg = txg;
@@ -4250,6 +4441,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
ASSERT(msp->ms_loaded);
was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
mg->mg_secondaries[allocator] != NULL) {
msp = mg->mg_secondaries[allocator];
@@ -4263,6 +4455,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
ASSERT(msp->ms_loaded);
was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
want_unique, asize, allocator, try_hard, zal,
@@ -4293,7 +4486,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock. We check the
- * active status first to see if we need to reselect
+ * active status first to see if we need to select
* a new metaslab.
*/
if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
@@ -4336,7 +4529,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
- msp->ms_selected_txg = txg;
+ metaslab_set_selected_txg(msp, txg);
int activation_error =
metaslab_activate(msp, allocator, activation_weight);
@@ -5027,6 +5220,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
offset, size);
+ msp->ms_allocating_total -= size;
VERIFY(!msp->ms_condensing);
VERIFY3U(offset, >=, msp->ms_start);
@@ -5158,10 +5352,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
range_tree_clear(msp->ms_trim, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ msp->ms_selected_txg = txg;
+ multilist_sublist_insert_head(mls, msp);
+ }
+ multilist_sublist_unlock(mls);
+
if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_allocating[txg & TXG_MASK],
offset, size);
+ msp->ms_allocating_total += size;
}
mutex_exit(&msp->ms_lock);
@@ -5571,7 +5775,7 @@ metaslab_disable(metaslab_t *msp)
}
void
-metaslab_enable(metaslab_t *msp, boolean_t sync)
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
{
metaslab_group_t *mg = msp->ms_group;
spa_t *spa = mg->mg_vd->vdev_spa;
@@ -5589,6 +5793,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync)
if (--msp->ms_disabled == 0) {
mg->mg_ms_disabled--;
cv_broadcast(&mg->mg_ms_disabled_cv);
+ if (unload)
+ metaslab_unload(msp);
}
mutex_exit(&msp->ms_lock);
mutex_exit(&mg->mg_ms_disabled_lock);
@@ -5710,6 +5916,10 @@ MODULE_PARM_DESC(metaslab_df_use_largest_segment,
module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
"how long to trust the cached max chunk size of a metaslab");
+
+module_param(zfs_metaslab_mem_limit, int, 0644);
+MODULE_PARM_DESC(zfs_metaslab_mem_limit,
+ "percentage of memory that can be used to store metaslab range trees");
/* END CSTYLED */
#endif
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 437efb50f..c404e876b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -9013,6 +9013,10 @@ spa_sync(spa_t *spa, uint64_t txg)
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
!= NULL)
vdev_sync_done(vd, txg);
+
+ metaslab_class_evict_old(spa->spa_normal_class, txg);
+ metaslab_class_evict_old(spa->spa_log_class, txg);
+
spa_sync_close_syncing_log_sm(spa);
spa_update_dspace(spa);
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index ad82e025e..550aa1e3a 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -1189,6 +1189,7 @@ out:
if (metaslab_debug_load && m->ms_sm != NULL) {
VERIFY0(metaslab_load(m));
+ metaslab_set_selected_txg(m, 0);
}
mutex_exit(&m->ms_lock);
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5644b9c5b..a6280e011 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3262,20 +3262,6 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
!= NULL)
metaslab_sync_done(msp, txg);
- /*
- * Because this function is only called on dirty vdevs, it's possible
- * we won't consider all metaslabs for unloading on every
- * txg. However, unless the system is largely idle it is likely that
- * we will dirty all vdevs within a few txgs.
- */
- for (int i = 0; i < vd->vdev_ms_count; i++) {
- msp = vd->vdev_ms[i];
- mutex_enter(&msp->ms_lock);
- if (msp->ms_sm != NULL)
- metaslab_potentially_unload(msp, txg);
- mutex_exit(&msp->ms_lock);
- }
-
if (reassess)
metaslab_sync_reassess(vd->vdev_mg);
}
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index b15901326..a355f185c 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -483,6 +483,7 @@ vdev_initialize_thread(void *arg)
for (uint64_t i = 0; !vd->vdev_detached &&
i < vd->vdev_top->vdev_ms_count; i++) {
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ boolean_t unload_when_done = B_FALSE;
/*
* If we've expanded the top-level vdev or it's our
@@ -496,6 +497,8 @@ vdev_initialize_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
metaslab_disable(msp);
mutex_enter(&msp->ms_lock);
+ if (!msp->ms_loaded && !msp->ms_loading)
+ unload_when_done = B_TRUE;
VERIFY0(metaslab_load(msp));
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
@@ -503,7 +506,7 @@ vdev_initialize_thread(void *arg)
mutex_exit(&msp->ms_lock);
error = vdev_initialize_ranges(vd, deadbeef);
- metaslab_enable(msp, B_TRUE);
+ metaslab_enable(msp, B_TRUE, unload_when_done);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 5ad47cccd..70b122a0a 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -837,7 +837,7 @@ vdev_trim_thread(void *arg)
*/
if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_trim_calculate_progress(vd);
continue;
@@ -849,7 +849,7 @@ vdev_trim_thread(void *arg)
mutex_exit(&msp->ms_lock);
error = vdev_trim_ranges(&ta);
- metaslab_enable(msp, B_TRUE);
+ metaslab_enable(msp, B_TRUE, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
range_tree_vacate(ta.trim_tree, NULL, NULL);
@@ -1154,7 +1154,7 @@ vdev_autotrim_thread(void *arg)
if (msp->ms_sm == NULL ||
range_tree_is_empty(msp->ms_trim)) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
continue;
}
@@ -1170,7 +1170,7 @@ vdev_autotrim_thread(void *arg)
*/
if (msp->ms_disabled > 1) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
continue;
}
@@ -1288,7 +1288,7 @@ vdev_autotrim_thread(void *arg)
range_tree_vacate(trim_tree, NULL, NULL);
range_tree_destroy(trim_tree);
- metaslab_enable(msp, issued_trim);
+ metaslab_enable(msp, issued_trim, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
for (uint64_t c = 0; c < children; c++) {