path: root/module/zfs/metaslab.c
Diffstat (limited to 'module/zfs/metaslab.c')
-rw-r--r--  module/zfs/metaslab.c  594
1 file changed, 443 insertions, 151 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d4dacbc2f..7b8ac91b1 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -36,6 +36,7 @@
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>
+#include <sys/btree.h>
#define WITH_DF_BLOCK_ALLOCATOR
@@ -184,6 +185,13 @@ int metaslab_df_free_pct = 4;
int metaslab_df_max_search = 16 * 1024 * 1024;
/*
+ * Force the metaslab_block_picker function to search forward through at
+ * least this many segments before giving up on finding a segment that the
+ * allocation will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
* If we are not searching forward (due to metaslab_df_max_search,
* metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
* controls what segment is used. If it is set, we will use the largest free
@@ -277,17 +285,32 @@ uint64_t metaslab_trace_max_entries = 5000;
int max_disabled_ms = 3;
/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
* Maximum percentage of memory to use on storing loaded metaslabs. If loading
* a metaslab would take it over this percentage, the oldest selected metaslab
* is automatically unloaded.
*/
-int zfs_metaslab_mem_limit = 25;
+int zfs_metaslab_mem_limit = 75;
/*
- * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
- * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
*/
-unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
static uint64_t metaslab_weight(metaslab_t *, boolean_t);
static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
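To make the floor above concrete (values are the defaults shown in this hunk; the snippet is a sketch mirroring metaslab_rt_add() later in this diff, not code from the change): with metaslab_by_size_min_shift = 14, only segments of at least 1 << 14 = 16 KiB are tracked in the size-sorted B-trees, so the many small fragments of a loaded metaslab never occupy B-tree leaves.

    /*
     * Illustrative only: a segment enters the size-sorted tree only if
     * it is at least (1 << metaslab_by_size_min_shift) bytes long.
     * With the default shift of 14 (16 KiB), a 4 KiB free fragment is
     * skipped while a 128 KiB one is added.
     */
    uint64_t seg_size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
    if (seg_size >= (1ULL << metaslab_by_size_min_shift))
        zfs_btree_add(size_tree, rs);

When the B-tree turns out to be empty at allocation time, metaslab_size_tree_full_load() rebuilds it with the floor dropped to zero.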
@@ -299,8 +322,66 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
static unsigned int metaslab_idx_func(multilist_t *, void *);
static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
+
+typedef struct metaslab_stats {
+ kstat_named_t metaslabstat_trace_over_limit;
+ kstat_named_t metaslabstat_df_find_under_floor;
+ kstat_named_t metaslabstat_reload_tree;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+ { "trace_over_limit", KSTAT_DATA_UINT64 },
+ { "df_find_under_floor", KSTAT_DATA_UINT64 },
+ { "reload_tree", KSTAT_DATA_UINT64 },
+};
+
+#define METASLABSTAT_BUMP(stat) \
+ atomic_inc_64(&metaslab_stats.stat.value.ui64);
+
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+ "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (metaslab_ksp != NULL) {
+ metaslab_ksp->ks_data = &metaslab_stats;
+ kstat_install(metaslab_ksp);
+ }
+}
+
+void
+metaslab_stat_fini(void)
+{
+ if (metaslab_ksp != NULL) {
+ kstat_delete(metaslab_ksp);
+ metaslab_ksp = NULL;
+ }
+
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+#else
+
+void
+metaslab_stat_init(void)
+{
+}
+
+void
+metaslab_stat_fini(void)
+{
+}
#endif
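For context on how these counters are consumed (a usage sketch, not part of the change): within _METASLAB_TRACING builds a counter is bumped with the METASLABSTAT_BUMP() macro defined above, and because the kstat is registered under the "zfs" module with the name "metaslab_stats", Linux builds would normally expose it through the usual SPL path /proc/spl/kstat/zfs/metaslab_stats.

    /* Usage sketch: record one more on-demand rebuild of a size-sorted tree. */
    #ifdef _METASLAB_TRACING
    METASLABSTAT_BUMP(metaslabstat_reload_tree);
    #endif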
/*
@@ -608,13 +689,13 @@ metaslab_compare(const void *x1, const void *x2)
if (sort1 > sort2)
return (1);
- int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
+ int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
if (likely(cmp))
return (cmp);
- IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+ IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
- return (AVL_CMP(m1->ms_start, m2->ms_start));
+ return (TREE_CMP(m1->ms_start, m2->ms_start));
}
/*
@@ -711,17 +792,17 @@ metaslab_sort_by_flushed(const void *va, const void *vb)
const metaslab_t *a = va;
const metaslab_t *b = vb;
- int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+ int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
if (likely(cmp))
return (cmp);
uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
- cmp = AVL_CMP(a_vdev_id, b_vdev_id);
+ cmp = TREE_CMP(a_vdev_id, b_vdev_id);
if (cmp)
return (cmp);
- return (AVL_CMP(a->ms_id, b->ms_id));
+ return (TREE_CMP(a->ms_id, b->ms_id));
}
metaslab_group_t *
@@ -1212,25 +1293,170 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
*/
/*
- * Comparison function for the private size-ordered tree. Tree is sorted
- * by size, larger sizes at the end of the tree.
+ * Comparison function for the private size-ordered tree using 32-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * Comparison function for the private size-ordered tree using 64-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
static int
-metaslab_rangesize_compare(const void *x1, const void *x2)
+metaslab_rangesize64_compare(const void *x1, const void *x2)
{
- const range_seg_t *r1 = x1;
- const range_seg_t *r2 = x2;
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
uint64_t rs_size1 = r1->rs_end - r1->rs_start;
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
- int cmp = AVL_CMP(rs_size1, rs_size2);
+ int cmp = TREE_CMP(rs_size1, rs_size2);
if (likely(cmp))
return (cmp);
- return (AVL_CMP(r1->rs_start, r2->rs_start));
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+typedef struct metaslab_rt_arg {
+ zfs_btree_t *mra_bt;
+ uint32_t mra_floor_shift;
+} metaslab_rt_arg_t;
+
+struct mssa_arg {
+ range_tree_t *rt;
+ metaslab_rt_arg_t *mra;
+};
+
+static void
+metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
+{
+ struct mssa_arg *mssap = arg;
+ range_tree_t *rt = mssap->rt;
+ metaslab_rt_arg_t *mrap = mssap->mra;
+ range_seg_max_t seg = {0};
+ rs_set_start(&seg, rt, start);
+ rs_set_end(&seg, rt, start + size);
+ metaslab_rt_add(rt, &seg, mrap);
+}
+
+static void
+metaslab_size_tree_full_load(range_tree_t *rt)
+{
+ metaslab_rt_arg_t *mrap = rt->rt_arg;
+#ifdef _METASLAB_TRACING
+ METASLABSTAT_BUMP(metaslabstat_reload_tree);
+#endif
+ ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
+ mrap->mra_floor_shift = 0;
+ struct mssa_arg arg = {0};
+ arg.rt = rt;
+ arg.mra = mrap;
+ range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
/*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered zfs_btree_t and an array of uint64_t's.
+ */
+/* ARGSUSED */
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = metaslab_rangesize32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = metaslab_rangesize64_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, compare, size);
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ zfs_btree_destroy(size_tree);
+ kmem_free(mrap, sizeof (*mrap));
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
+ (1 << mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+ mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ metaslab_rt_create(rt, arg);
+}
+
+static range_tree_ops_t metaslab_rt_ops = {
+ .rtop_create = metaslab_rt_create,
+ .rtop_destroy = metaslab_rt_destroy,
+ .rtop_add = metaslab_rt_add,
+ .rtop_remove = metaslab_rt_remove,
+ .rtop_vacate = metaslab_rt_vacate
+};
+
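To see how these callbacks get attached to a range tree (a sketch that mirrors the ms_unflushed_frees setup in metaslab_sync_done() further down in this diff): the caller allocates a metaslab_rt_arg_t, points mra_bt at the B-tree that should shadow the range tree, and passes both to range_tree_create(); every subsequent add/remove on the offset-ordered tree then keeps the size-sorted B-tree in sync for segments above the floor.

    /* Sketch of the wiring, mirroring metaslab_sync_done() below. */
    metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
    mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
    mrap->mra_floor_shift = metaslab_by_size_min_shift;
    msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
        type, mrap, start, shift);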
+/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
@@ -1242,16 +1468,20 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
uint64_t
metaslab_largest_allocatable(metaslab_t *msp)
{
- avl_tree_t *t = &msp->ms_allocatable_by_size;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL)
return (0);
- rs = avl_last(t);
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ rs = zfs_btree_last(t, NULL);
if (rs == NULL)
return (0);
- return (rs->rs_end - rs->rs_start);
+ return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
+ msp->ms_allocatable));
}
/*
@@ -1266,7 +1496,10 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
if (msp->ms_unflushed_frees == NULL)
return (0);
- range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
+ if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_unflushed_frees);
+ range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
+ NULL);
if (rs == NULL)
return (0);
@@ -1293,8 +1526,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
* the largest segment; there may be other usable chunks in the
* largest segment, but we ignore them.
*/
- uint64_t rstart = rs->rs_start;
- uint64_t rsize = rs->rs_end - rstart;
+ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
+ uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
uint64_t start = 0;
uint64_t size = 0;
@@ -1318,17 +1551,18 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
}
static range_seg_t *
-metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
+ uint64_t size, zfs_btree_index_t *where)
{
- range_seg_t *rs, rsearch;
- avl_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
- rsearch.rs_start = start;
- rsearch.rs_end = start + size;
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, start + size);
- rs = avl_find(t, &rsearch, &where);
+ rs = zfs_btree_find(t, &rsearch, where);
if (rs == NULL) {
- rs = avl_nearest(t, where, AVL_AFTER);
+ rs = zfs_btree_next(t, where, where);
}
return (rs);
@@ -1337,27 +1571,34 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
/*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * This is a helper function that can be used by the allocator to find a
+ * suitable block to allocate. This will search the specified B-tree looking
+ * for a block that matches the specified criteria.
*/
static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
uint64_t max_search)
{
- range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+ if (*cursor == 0)
+ *cursor = rt->rt_start;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+ range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
uint64_t first_found;
+ int count_searched = 0;
if (rs != NULL)
- first_found = rs->rs_start;
+ first_found = rs_get_start(rs, rt);
- while (rs != NULL && rs->rs_start - first_found <= max_search) {
- uint64_t offset = rs->rs_start;
- if (offset + size <= rs->rs_end) {
+ while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
+ max_search || count_searched < metaslab_min_search_count)) {
+ uint64_t offset = rs_get_start(rs, rt);
+ if (offset + size <= rs_get_end(rs, rt)) {
*cursor = offset + size;
return (offset);
}
- rs = AVL_NEXT(t, rs);
+ rs = zfs_btree_next(bt, &where, &where);
+ count_searched++;
}
*cursor = 0;
@@ -1403,8 +1644,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
uint64_t offset;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(&rt->rt_root), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
/*
* If we're running low on space, find a segment based on size,
@@ -1414,22 +1653,33 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
free_pct < metaslab_df_free_pct) {
offset = -1;
} else {
- offset = metaslab_block_picker(&rt->rt_root,
+ offset = metaslab_block_picker(rt,
cursor, size, metaslab_df_max_search);
}
if (offset == -1) {
range_seg_t *rs;
+ if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
if (metaslab_df_use_largest_segment) {
/* use largest free segment */
- rs = avl_last(&msp->ms_allocatable_by_size);
+ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
} else {
+ zfs_btree_index_t where;
/* use segment of this size, or next largest */
+#ifdef _METASLAB_TRACING
+ metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
+ if (size < (1 << mrap->mra_floor_shift)) {
+ METASLABSTAT_BUMP(
+ metaslabstat_df_find_under_floor);
+ }
+#endif
rs = metaslab_block_find(&msp->ms_allocatable_by_size,
- 0, size);
+ rt, msp->ms_start, size, &where);
}
- if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
- offset = rs->rs_start;
+ if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+ rt)) {
+ offset = rs_get_start(rs, rt);
*cursor = offset + size;
}
}
@@ -1458,25 +1708,27 @@ static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &msp->ms_allocatable_by_size;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
ASSERT3U(*cursor_end, >=, *cursor);
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
- rs = avl_last(&msp->ms_allocatable_by_size);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
+ size)
return (-1ULL);
- *cursor = rs->rs_start;
- *cursor_end = rs->rs_end;
+ *cursor = rs_get_start(rs, rt);
+ *cursor_end = rs_get_end(rs, rt);
}
offset = *cursor;
@@ -1511,39 +1763,40 @@ uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &msp->ms_allocatable->rt_root;
- avl_index_t where;
- range_seg_t *rs, rsearch;
+ zfs_btree_t *t = &msp->ms_allocatable->rt_root;
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
uint64_t max_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
- rsearch.rs_start = *cursor;
- rsearch.rs_end = *cursor + size;
+ rs_set_start(&rsearch, rt, *cursor);
+ rs_set_end(&rsearch, rt, *cursor + size);
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
t = &msp->ms_allocatable_by_size;
- rsearch.rs_start = 0;
- rsearch.rs_end = MIN(max_size,
- 1ULL << (hbit + metaslab_ndf_clump_shift));
- rs = avl_find(t, &rsearch, &where);
+ rs_set_start(&rsearch, rt, 0);
+ rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
+ metaslab_ndf_clump_shift)));
+
+ rs = zfs_btree_find(t, &rsearch, &where);
if (rs == NULL)
- rs = avl_nearest(t, where, AVL_AFTER);
+ rs = zfs_btree_next(t, &where, &where);
ASSERT(rs != NULL);
}
- if ((rs->rs_end - rs->rs_start) >= size) {
- *cursor = rs->rs_start + size;
- return (rs->rs_start);
+ if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
+ *cursor = rs_get_start(rs, rt) + size;
+ return (rs_get_start(rs, rt));
}
return (-1ULL);
}
@@ -1889,9 +2142,8 @@ metaslab_potentially_evict(metaslab_class_t *mc)
{
#ifdef _KERNEL
uint64_t allmem = arc_all_memory();
- extern kmem_cache_t *range_seg_cache;
- uint64_t inuse = range_seg_cache->skc_obj_total;
- uint64_t size = range_seg_cache->skc_obj_size;
+ uint64_t inuse = zfs_btree_leaf_cache->skc_obj_total;
+ uint64_t size = zfs_btree_leaf_cache->skc_obj_size;
int tries = 0;
for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
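To put the eviction threshold in this loop in concrete terms (the memory size is hypothetical, chosen only to illustrate the arithmetic): eviction runs while the memory pinned by zfs_btree_leaf_cache, inuse * size, exceeds allmem * zfs_metaslab_mem_limit / 100. On a 16 GiB system with the new default of 75 that is a 12 GiB budget; the old range_seg_cache accounting used the same formula against the previous 25% default.

    /* Hypothetical numbers: 16 GiB of memory, zfs_metaslab_mem_limit = 75. */
    uint64_t allmem = 16ULL << 30;
    uint64_t budget = allmem * zfs_metaslab_mem_limit / 100;  /* 12 GiB */
    boolean_t should_evict = (inuse * size > budget);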
@@ -1928,7 +2180,7 @@ metaslab_potentially_evict(metaslab_class_t *mc)
*/
if (msp->ms_loading) {
msp = next_msp;
- inuse = range_seg_cache->skc_obj_total;
+ inuse = zfs_btree_leaf_cache->skc_obj_total;
continue;
}
/*
@@ -1950,7 +2202,7 @@ metaslab_potentially_evict(metaslab_class_t *mc)
}
mutex_exit(&msp->ms_lock);
msp = next_msp;
- inuse = range_seg_cache->skc_obj_total;
+ inuse = zfs_btree_leaf_cache->skc_obj_total;
}
}
#endif
@@ -1993,11 +2245,40 @@ metaslab_load_impl(metaslab_t *msp)
mutex_exit(&msp->ms_lock);
hrtime_t load_start = gethrtime();
+ metaslab_rt_arg_t *mrap;
+ if (msp->ms_allocatable->rt_arg == NULL) {
+ mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ } else {
+ mrap = msp->ms_allocatable->rt_arg;
+ msp->ms_allocatable->rt_ops = NULL;
+ msp->ms_allocatable->rt_arg = NULL;
+ }
+ mrap->mra_bt = &msp->ms_allocatable_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
if (msp->ms_sm != NULL) {
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
SM_FREE, length);
+
+ /* Now, populate the size-sorted tree. */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+
+ struct mssa_arg arg = {0};
+ arg.rt = msp->ms_allocatable;
+ arg.mra = mrap;
+ range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+ &arg);
} else {
/*
+ * Add the size-sorted tree first, since we don't need to load
+ * the metaslab from the spacemap.
+ */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+ /*
* The space map has not been allocated yet, so treat
* all the space in the metaslab as free and add it to the
* ms_allocatable tree.
@@ -2252,6 +2533,29 @@ metaslab_unload(metaslab_t *msp)
metaslab_recalculate_weight_and_sort(msp);
}
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexed from the start of the metaslab. If
+ * vdev_ms_shift - vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
+ */
+static range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+ uint64_t *start, uint64_t *shift)
+{
+ if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+ !zfs_metaslab_force_large_segs) {
+ *shift = vdev->vdev_ashift;
+ *start = msp->ms_start;
+ return (RANGE_SEG32);
+ } else {
+ *shift = 0;
+ *start = 0;
+ return (RANGE_SEG64);
+ }
+}
+
void
metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
{
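As a worked example of the rule above (the geometry is hypothetical and only illustrates the arithmetic; the call mirrors the one added to metaslab_init() below): a vdev with ashift = 12 (4 KiB sectors) and vdev_ms_shift = 34 (16 GiB metaslabs) gives 34 - 12 = 22 < 32, so any offset inside the metaslab, once rebased to ms_start and shifted down by ashift, spans fewer than 2^22 sector units and fits in a range_seg32_t; setting zfs_metaslab_force_large_segs keeps the 64-bit segments for debugging.

    /* Hypothetical geometry: ashift = 12, vdev_ms_shift = 34. */
    uint64_t start, shift;
    range_seg_type_t type =
        metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
    /* Here type == RANGE_SEG32, start == ms->ms_start, shift == 12, */
    /* so raw segment values are (byte_offset - ms_start) >> 12.     */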
@@ -2327,6 +2631,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
/*
* We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
@@ -2335,10 +2643,9 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
* we'd data fault on any attempt to use this metaslab before
* it's ready.
*/
- ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
- &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
+ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_trim = range_tree_create(NULL, NULL);
+ ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms, B_FALSE);
@@ -2393,7 +2700,7 @@ metaslab_unflushed_changes_memused(metaslab_t *ms)
{
return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
range_tree_numsegs(ms->ms_unflushed_frees)) *
- sizeof (range_seg_t));
+ ms->ms_unflushed_allocs->rt_root.bt_elem_size);
}
void
@@ -3187,7 +3494,7 @@ metaslab_should_condense(metaslab_t *msp)
* We always condense metaslabs that are empty and metaslabs for
* which a condense request has been made.
*/
- if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+ if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
msp->ms_condense_wanted)
return (B_TRUE);
@@ -3233,28 +3540,29 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* So to truncate the space map to represent all the entries of
* previous TXGs we do the following:
*
- * 1] We create a range tree (condense tree) that is 100% allocated.
- * 2] We remove from it all segments found in the ms_defer trees
+ * 1] We create a range tree (condense tree) that is 100% empty.
+ * 2] We add to it all segments found in the ms_defer trees
* as those segments are marked as free in the original space
* map. We do the same with the ms_allocating trees for the same
- * reason. Removing these segments should be a relatively
+ * reason. Adding these segments should be a relatively
* inexpensive operation since we expect these trees to have a
* small number of nodes.
- * 3] We vacate any unflushed allocs as they should already exist
- * in the condense tree. Then we vacate any unflushed frees as
- * they should already be part of ms_allocatable.
- * 4] At this point, we would ideally like to remove all segments
+ * 3] We vacate any unflushed allocs, since they are not frees we
+ * need to add to the condense tree. Then we vacate any
+ * unflushed frees as they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to add all segments
* in the ms_allocatable tree to the condense tree. This way
* we would write all the entries of the condense tree as the
- * condensed space map, which would only contain allocated
- * segments with everything else assumed to be freed.
+ * condensed space map, which would only contain freed
+ * segments with everything else assumed to be allocated.
*
* Doing so can be prohibitively expensive as ms_allocatable can
- * be large, and therefore computationally expensive to subtract
- * from the condense_tree. Instead we first sync out the
- * condense_tree and then the ms_allocatable, in the condensed
- * space map. While this is not optimal, it is typically close to
- * optimal and more importantly much cheaper to compute.
+ * be large, and therefore computationally expensive to add to
+ * the condense_tree. Instead we first sync out an entry marking
+ * everything as allocated, then the condense_tree and then the
+ * ms_allocatable, in the condensed space map. While this is not
+ * optimal, it is typically close to optimal and more importantly
+ * much cheaper to compute.
*
* 5] Finally, as both of the unflushed trees were written to our
* new and condensed metaslab space map, we basically flushed
@@ -3268,22 +3576,26 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
spa->spa_name, space_map_length(msp->ms_sm),
- avl_numnodes(&msp->ms_allocatable->rt_root),
+ range_tree_numsegs(msp->ms_allocatable),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
- condense_tree = range_tree_create(NULL, NULL);
- range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+ &start, &shift);
+
+ condense_tree = range_tree_create(NULL, type, NULL, start, shift);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
- range_tree_remove, condense_tree);
+ range_tree_add, condense_tree);
}
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
- range_tree_remove, condense_tree);
+ range_tree_add, condense_tree);
}
ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
@@ -3331,11 +3643,17 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* followed by FREES (due to space_map_write() in metaslab_sync()) for
* sync pass 1.
*/
- space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+ shift);
+ range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+ space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
+ range_tree_vacate(tmp_tree, NULL, NULL);
+ range_tree_destroy(tmp_tree);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
@@ -3578,7 +3896,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
return;
- VERIFY(txg <= spa_final_dirty_txg(spa));
+ VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
/*
* The only state that can actually be changing concurrently
@@ -3867,32 +4185,46 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* range trees and add its capacity to the vdev.
*/
if (msp->ms_freed == NULL) {
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, msp, &start,
+ &shift);
+
for (int t = 0; t < TXG_SIZE; t++) {
ASSERT(msp->ms_allocating[t] == NULL);
- msp->ms_allocating[t] = range_tree_create(NULL, NULL);
+ msp->ms_allocating[t] = range_tree_create(NULL, type,
+ NULL, start, shift);
}
ASSERT3P(msp->ms_freeing, ==, NULL);
- msp->ms_freeing = range_tree_create(NULL, NULL);
+ msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
+ shift);
ASSERT3P(msp->ms_freed, ==, NULL);
- msp->ms_freed = range_tree_create(NULL, NULL);
+ msp->ms_freed = range_tree_create(NULL, type, NULL, start,
+ shift);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
ASSERT3P(msp->ms_defer[t], ==, NULL);
- msp->ms_defer[t] = range_tree_create(NULL, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
+ start, shift);
}
ASSERT3P(msp->ms_checkpointing, ==, NULL);
- msp->ms_checkpointing = range_tree_create(NULL, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
+ start, shift);
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
- msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
+ msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
- msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
- &msp->ms_unflushed_frees_by_size,
- metaslab_rangesize_compare, 0);
+ msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+ type, mrap, start, shift);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
@@ -4052,36 +4384,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva)
* ==========================================================================
*/
#ifdef _METASLAB_TRACING
-kstat_t *metaslab_trace_ksp;
-kstat_named_t metaslab_trace_over_limit;
-
-void
-metaslab_alloc_trace_init(void)
-{
- ASSERT(metaslab_alloc_trace_cache == NULL);
- metaslab_alloc_trace_cache = kmem_cache_create(
- "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
- metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
- "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
- if (metaslab_trace_ksp != NULL) {
- metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
- kstat_named_init(&metaslab_trace_over_limit,
- "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
- kstat_install(metaslab_trace_ksp);
- }
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
- if (metaslab_trace_ksp != NULL) {
- kstat_delete(metaslab_trace_ksp);
- metaslab_trace_ksp = NULL;
- }
- kmem_cache_destroy(metaslab_alloc_trace_cache);
- metaslab_alloc_trace_cache = NULL;
-}
/*
* Add an allocation trace element to the allocation tracing list.
@@ -4108,7 +4410,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
#ifdef DEBUG
panic("too many entries in allocation list");
#endif
- atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
zal->zal_size--;
mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
list_remove(&zal->zal_list, mat_next);
@@ -4161,16 +4463,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
void
-metaslab_alloc_trace_init(void)
-{
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
-}
-
-void
metaslab_trace_init(zio_alloc_list_t *zal)
{
}