-rw-r--r--  cmd/zdb/zdb.c | 19
-rw-r--r--  include/sys/Makefile.am | 2
-rw-r--r--  include/sys/avl.h | 6
-rw-r--r--  include/sys/bitops.h | 90
-rw-r--r--  include/sys/btree.h | 238
-rw-r--r--  include/sys/metaslab.h | 4
-rw-r--r--  include/sys/metaslab_impl.h | 4
-rw-r--r--  include/sys/range_tree.h | 242
-rw-r--r--  include/sys/spa.h | 46
-rw-r--r--  include/sys/space_reftree.h | 2
-rw-r--r--  include/sys/vdev.h | 4
-rw-r--r--  include/sys/vdev_impl.h | 8
-rw-r--r--  lib/libzfs/libzfs_dataset.c | 2
-rw-r--r--  lib/libzfs/libzfs_iter.c | 2
-rw-r--r--  lib/libzfs/libzfs_sendrecv.c | 2
-rw-r--r--  lib/libzpool/Makefile.am | 1
-rw-r--r--  lib/libzutil/zutil_import.c | 4
-rw-r--r--  module/os/linux/zfs/zfs_znode.c | 2
-rw-r--r--  module/zfs/Makefile.in | 1
-rw-r--r--  module/zfs/arc.c | 3
-rw-r--r--  module/zfs/btree.c | 2124
-rw-r--r--  module/zfs/ddt.c | 2
-rw-r--r--  module/zfs/dmu_objset.c | 2
-rw-r--r--  module/zfs/dmu_recv.c | 2
-rw-r--r--  module/zfs/dmu_send.c | 14
-rw-r--r--  module/zfs/dnode.c | 11
-rw-r--r--  module/zfs/dsl_bookmark.c | 6
-rw-r--r--  module/zfs/dsl_deadlist.c | 8
-rw-r--r--  module/zfs/dsl_deleg.c | 2
-rw-r--r--  module/zfs/dsl_scan.c | 65
-rw-r--r--  module/zfs/metaslab.c | 594
-rw-r--r--  module/zfs/range_tree.c | 564
-rw-r--r--  module/zfs/sa.c | 6
-rw-r--r--  module/zfs/spa.c | 2
-rw-r--r--  module/zfs/spa_misc.c | 15
-rw-r--r--  module/zfs/space_map.c | 37
-rw-r--r--  module/zfs/space_reftree.c | 15
-rw-r--r--  module/zfs/unique.c | 2
-rw-r--r--  module/zfs/vdev.c | 25
-rw-r--r--  module/zfs/vdev_cache.c | 4
-rw-r--r--  module/zfs/vdev_initialize.c | 31
-rw-r--r--  module/zfs/vdev_label.c | 6
-rw-r--r--  module/zfs/vdev_queue.c | 8
-rw-r--r--  module/zfs/vdev_raidz.c | 6
-rw-r--r--  module/zfs/vdev_removal.c | 66
-rw-r--r--  module/zfs/vdev_trim.c | 43
-rw-r--r--  module/zfs/zap_micro.c | 4
-rw-r--r--  module/zfs/zfs_fuid.c | 4
-rw-r--r--  module/zfs/zfs_rlock.c | 2
-rw-r--r--  module/zfs/zil.c | 8
50 files changed, 3722 insertions, 638 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index a6368c67d..0f09ec70a 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -103,6 +103,7 @@ extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern int zfs_reconstruct_indirect_combinations_max;
+extern int zfs_btree_verify_intensity;
static const char cmdname[] = "zdb";
uint8_t dump_opt[256];
@@ -949,7 +950,7 @@ dump_metaslab_stats(metaslab_t *msp)
{
char maxbuf[32];
range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &msp->ms_allocatable_by_size;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
/* make sure nicenum has enough space */
@@ -958,7 +959,7 @@ dump_metaslab_stats(metaslab_t *msp)
zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
- "segments", avl_numnodes(t), "maxsize", maxbuf,
+ "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
"freepct", free_pct);
(void) printf("\tIn-memory histogram:\n");
dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
@@ -3141,7 +3142,7 @@ cksum_record_compare(const void *x1, const void *x2)
int difference;
for (int i = 0; i < arraysize; i++) {
- difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
+ difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
if (difference)
break;
}
@@ -4063,7 +4064,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- range_tree_t *allocs = range_tree_create(NULL, NULL);
+ range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
metaslab_t *msp = vd->vdev_ms[msi];
@@ -6088,7 +6089,8 @@ dump_zpool(spa_t *spa)
if (dump_opt['d'] || dump_opt['i']) {
spa_feature_t f;
- mos_refd_objs = range_tree_create(NULL, NULL);
+ mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+ 0);
dump_objset(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
@@ -6643,6 +6645,13 @@ main(int argc, char **argv)
if (spa_config_path_env != NULL)
spa_config_path = spa_config_path_env;
+ /*
+ * For performance reasons, we set this tunable down. We do so before
+ * the arg parsing section so that the user can override this value if
+ * they choose.
+ */
+ zfs_btree_verify_intensity = 3;
+
while ((c = getopt(argc, argv,
"AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
switch (c) {
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 734d5c161..5785bfd55 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -7,10 +7,12 @@ COMMON_H = \
$(top_srcdir)/include/sys/arc_impl.h \
$(top_srcdir)/include/sys/avl.h \
$(top_srcdir)/include/sys/avl_impl.h \
+ $(top_srcdir)/include/sys/bitops.h \
$(top_srcdir)/include/sys/blkptr.h \
$(top_srcdir)/include/sys/bplist.h \
$(top_srcdir)/include/sys/bpobj.h \
$(top_srcdir)/include/sys/bptree.h \
+ $(top_srcdir)/include/sys/btree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/cityhash.h \
$(top_srcdir)/include/sys/dataset_kstats.h \
diff --git a/include/sys/avl.h b/include/sys/avl.h
index 962e8b1cf..6c4e7ed5c 100644
--- a/include/sys/avl.h
+++ b/include/sys/avl.h
@@ -108,9 +108,9 @@ extern "C" {
/*
* AVL comparator helpers
*/
-#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
-#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
-#define AVL_PCMP(a, b) \
+#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define TREE_PCMP(a, b) \
(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
/*
diff --git a/include/sys/bitops.h b/include/sys/bitops.h
new file mode 100644
index 000000000..56d52073b
--- /dev/null
+++ b/include/sys/bitops.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#ifndef _SYS_BITOPS_H
+#define _SYS_BITOPS_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1U << (len)); \
+ ASSERT3U(low + len, <=, 32); \
+ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF64_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1ULL << (len)); \
+ ASSERT3U(low + len, <=, 64); \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+/*
+ * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
+ * the case where val is a constant. We can't fix ASSERT because it's used as
+ * an expression in several places in the kernel; as a result, changing it to
+ * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
+ */
+#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITOPS_H */
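
A minimal usage sketch for the macros above (not part of the patch; the field layout and the example function are invented for illustration):

#include <sys/bitops.h>

/* Pack two hypothetical fields into one 64-bit word and read them back. */
static void
bitops_example(void)
{
        uint64_t word = 0;
        uint64_t blk = 0;

        /* bits [0,16) hold a 16-bit "size", bits [16,24) an 8-bit "type" */
        BF64_SET(word, 0, 16, 0x1234);
        BF64_SET(word, 16, 8, 7);
        ASSERT3U(BF64_GET(word, 0, 16), ==, 0x1234);
        ASSERT3U(BF64_GET(word, 16, 8), ==, 7);

        /*
         * The _SB variants apply a shift and bias, e.g. a byte count that
         * is always a multiple of 512 (shift = 9) packed into 16 bits.
         */
        BF64_SET_SB(blk, 0, 16, 9, 0, 8192);    /* stores 8192 >> 9 == 16 */
        ASSERT3U(BF64_GET_SB(blk, 0, 16, 9, 0), ==, 8192);
}
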
diff --git a/include/sys/btree.h b/include/sys/btree.h
new file mode 100644
index 000000000..1673b5217
--- /dev/null
+++ b/include/sys/btree.h
@@ -0,0 +1,238 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _BTREE_H
+#define _BTREE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+/*
+ * This file defines the interface for a B-Tree implementation for ZFS. The
+ * tree can be used to store arbitrary sortable data types with low overhead
+ * and good operation performance. In addition the tree intelligently
+ * optimizes bulk in-order insertions to improve memory use and performance.
+ *
+ * Note that for all B-Tree functions, the values returned are pointers to the
+ * internal copies of the data in the tree. The internal data can only be
+ * safely mutated if the changes cannot change the ordering of the element
+ * with respect to any other elements in the tree.
+ *
+ * The major drawback of the B-Tree is that any returned elements or indexes
+ * are only valid until a side-effectful operation occurs, since these can
+ * result in reallocation or relocation of data. Side effectful operations are
+ * defined as insertion, removal, and zfs_btree_destroy_nodes.
+ *
+ * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
+ * nodes have an array of children pointing to other nodes, and an array of
+ * elements that act as separators between the elements of the subtrees rooted
+ * at its children. Leaf nodes only contain data elements, and form the bottom
+ * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
+ * elements in the core nodes are not copies of or references to leaf node
+ * elements. Each element occurs only once in the tree, no matter what kind
+ * of node it is in.
+ *
+ * The tree's height is the same throughout, unlike many other forms of search
+ * tree. Each node (except for the root) must be between half minus one and
+ * completely full of elements (and children) at all times. Any operation that
+ * would put the node outside of that range results in a rebalancing operation
+ * (taking, merging, or splitting).
+ *
+ * This tree was implemented using descriptions from Wikipedia's articles on
+ * B-Trees and B+ Trees.
+ */
+
+/*
+ * Decreasing these values results in smaller memmove operations, but more of
+ * them, and increased memory overhead. Increasing these values results in
+ * higher variance in operation time, and reduces memory overhead.
+ */
+#define BTREE_CORE_ELEMS 128
+#define BTREE_LEAF_SIZE 4096
+
+extern kmem_cache_t *zfs_btree_leaf_cache;
+
+typedef struct zfs_btree_hdr {
+ struct zfs_btree_core *bth_parent;
+ boolean_t bth_core;
+ /*
+ * For both leaf and core nodes, represents the number of elements in
+ * the node. For core nodes, they will have bth_count + 1 children.
+ */
+ uint32_t bth_count;
+} zfs_btree_hdr_t;
+
+typedef struct zfs_btree_core {
+ zfs_btree_hdr_t btc_hdr;
+ zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1];
+ uint8_t btc_elems[];
+} zfs_btree_core_t;
+
+typedef struct zfs_btree_leaf {
+ zfs_btree_hdr_t btl_hdr;
+ uint8_t btl_elems[];
+} zfs_btree_leaf_t;
+
+typedef struct zfs_btree_index {
+ zfs_btree_hdr_t *bti_node;
+ uint64_t bti_offset;
+ /*
+ * True if the location is before the listed offset, false if it's at
+ * the listed offset.
+ */
+ boolean_t bti_before;
+} zfs_btree_index_t;
+
+typedef struct btree {
+ zfs_btree_hdr_t *bt_root;
+ int64_t bt_height;
+ size_t bt_elem_size;
+ uint64_t bt_num_elems;
+ uint64_t bt_num_nodes;
+ zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
+ int (*bt_compar) (const void *, const void *);
+} zfs_btree_t;
+
+/*
+ * Allocate and deallocate caches for btree nodes.
+ */
+void zfs_btree_init(void);
+void zfs_btree_fini(void);
+
+/*
+ * Initialize a B-Tree. Arguments are:
+ *
+ * tree - the tree to be initialized
+ * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
+ * -1 for <, 0 for ==, and +1 for >
+ * size - the value of sizeof(struct my_type)
+ */
+void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
+ size_t);
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL and then if "where" is not NULL it sets
+ * "where" for use with zfs_btree_insert() or zfs_btree_nearest().
+ *
+ * node - node that has the value being looked for
+ * where - position for use with zfs_btree_nearest() or zfs_btree_insert(),
+ * may be NULL
+ */
+void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node - the node to insert
+ * where - position as returned from zfs_btree_find()
+ */
+void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *);
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ */
+void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
+void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the next or previous valued node in the tree.
+ */
+void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
+ zfs_btree_index_t *);
+void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
+ zfs_btree_index_t *);
+
+/*
+ * Get a value from a tree and an index.
+ */
+void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Add a single value to the tree. The value must not compare equal to any
+ * other node already in the tree.
+ */
+void zfs_btree_add(zfs_btree_t *, const void *);
+
+/*
+ * Remove a single value from the tree. The value must be in the tree. The
+ * pointer passed in may be a pointer into a tree-controlled buffer, but it
+ * need not be.
+ */
+void zfs_btree_remove(zfs_btree_t *, const void *);
+
+/*
+ * Remove the value at the given location from the tree.
+ */
+void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the number of nodes in the tree
+ */
+ulong_t zfs_btree_numnodes(zfs_btree_t *);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
+ * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
+ *
+ * cookie - an index used to save state between calls to
+ * zfs_btree_destroy_nodes()
+ *
+ * EXAMPLE:
+ * zfs_btree_t *tree;
+ * struct my_data *node;
+ * zfs_btree_index_t *cookie;
+ *
+ * cookie = NULL;
+ * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ * data_destroy(node);
+ * zfs_btree_destroy(tree);
+ */
+void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
+
+/*
+ * Destroys all nodes in the tree quickly. This doesn't give the caller an
+ * opportunity to iterate over each node and do its own cleanup; for that, use
+ * zfs_btree_destroy_nodes().
+ */
+void zfs_btree_clear(zfs_btree_t *);
+
+/*
+ * Final destroy of a B-Tree. Arguments are:
+ *
+ * tree - the empty tree to destroy
+ */
+void zfs_btree_destroy(zfs_btree_t *tree);
+
+/* Runs a variety of self-checks on the btree to verify integrity. */
+void zfs_btree_verify(zfs_btree_t *tree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BTREE_H */
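
To make the interface above concrete, a minimal usage sketch follows (not part of the patch). The element type and function names are hypothetical; the comparator uses TREE_CMP from avl.h to satisfy the strict -1/0/+1 requirement, and zfs_btree_init() is assumed to have been called already (the kernel module and libzpool do this at startup).

#include <sys/btree.h>
#include <sys/avl.h>    /* TREE_CMP */

typedef struct my_elem {
        uint64_t me_key;        /* hypothetical payload */
} my_elem_t;

static int
my_elem_compare(const void *a, const void *b)
{
        const my_elem_t *l = a;
        const my_elem_t *r = b;

        /* Must return exactly -1, 0, or +1. */
        return (TREE_CMP(l->me_key, r->me_key));
}

static void
my_btree_example(void)
{
        zfs_btree_t bt;
        zfs_btree_index_t where;
        my_elem_t elem = { .me_key = 42 };
        my_elem_t *found;

        zfs_btree_create(&bt, my_elem_compare, sizeof (my_elem_t));

        /* Elements are copied into the tree by value. */
        if (zfs_btree_find(&bt, &elem, &where) == NULL)
                zfs_btree_insert(&bt, &elem, &where);

        found = zfs_btree_find(&bt, &elem, NULL);
        ASSERT3U(found->me_key, ==, 42);

        zfs_btree_remove(&bt, &elem);
        zfs_btree_destroy(&bt); /* tree must be empty at this point */
}
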
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 00b8b4758..f8d9c6a82 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -95,8 +95,8 @@ void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
-void metaslab_alloc_trace_init(void);
-void metaslab_alloc_trace_fini(void);
+void metaslab_stat_init(void);
+void metaslab_stat_fini(void);
void metaslab_trace_init(zio_alloc_list_t *);
void metaslab_trace_fini(zio_alloc_list_t *);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 3ce39183e..d140f741d 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -509,8 +509,8 @@ struct metaslab {
* only difference is that the ms_allocatable_by_size is ordered by
* segment sizes.
*/
- avl_tree_t ms_allocatable_by_size;
- avl_tree_t ms_unflushed_frees_by_size;
+ zfs_btree_t ms_allocatable_by_size;
+ zfs_btree_t ms_unflushed_frees_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index e299613b8..d80faa933 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -30,7 +30,7 @@
#ifndef _SYS_RANGE_TREE_H
#define _SYS_RANGE_TREE_H
-#include <sys/avl.h>
+#include <sys/btree.h>
#include <sys/dmu.h>
#ifdef __cplusplus
@@ -41,20 +41,35 @@ extern "C" {
typedef struct range_tree_ops range_tree_ops_t;
+typedef enum range_seg_type {
+ RANGE_SEG32,
+ RANGE_SEG64,
+ RANGE_SEG_GAP,
+ RANGE_SEG_NUM_TYPES,
+} range_seg_type_t;
+
/*
* Note: the range_tree may not be accessed concurrently; consumers
* must provide external locking if required.
*/
typedef struct range_tree {
- avl_tree_t rt_root; /* offset-ordered segment AVL tree */
+ zfs_btree_t rt_root; /* offset-ordered segment b-tree */
uint64_t rt_space; /* sum of all segments in the map */
- uint64_t rt_gap; /* allowable inter-segment gap */
+ range_seg_type_t rt_type; /* type of range_seg_t in use */
+ /*
+ * All data that is stored in the range tree must have a start higher
+ * than or equal to rt_start, and all sizes and offsets must be
+ * multiples of 1 << rt_shift.
+ */
+ uint8_t rt_shift;
+ uint64_t rt_start;
range_tree_ops_t *rt_ops;
- /* rt_avl_compare should only be set if rt_arg is an AVL tree */
+ /* rt_btree_compare should only be set if rt_arg is a b-tree */
void *rt_arg;
- int (*rt_avl_compare)(const void *, const void *);
+ int (*rt_btree_compare)(const void *, const void *);
+ uint64_t rt_gap; /* allowable inter-segment gap */
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -64,36 +79,217 @@ typedef struct range_tree {
uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
} range_tree_t;
-typedef struct range_seg {
- avl_node_t rs_node; /* AVL node */
- avl_node_t rs_pp_node; /* AVL picker-private node */
+typedef struct range_seg32 {
+ uint32_t rs_start; /* starting offset of this segment */
+ uint32_t rs_end; /* ending offset (non-inclusive) */
+} range_seg32_t;
+
+/*
+ * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may
+ * require 64-bit integers for ranges.
+ */
+typedef struct range_seg64 {
+ uint64_t rs_start; /* starting offset of this segment */
+ uint64_t rs_end; /* ending offset (non-inclusive) */
+} range_seg64_t;
+
+typedef struct range_seg_gap {
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
uint64_t rs_fill; /* actual fill if gap mode is on */
-} range_seg_t;
+} range_seg_gap_t;
+
+/*
+ * This type needs to be the largest of the range segs, since it will be stack
+ * allocated and then cast the actual type to do tree operations.
+ */
+typedef range_seg_gap_t range_seg_max_t;
+
+/*
+ * This is just for clarity of code purposes, so we can make it clear that a
+ * pointer is to a range seg of some type; when we need to do the actual math,
+ * we'll figure out the real type.
+ */
+typedef void range_seg_t;
struct range_tree_ops {
void (*rtop_create)(range_tree_t *rt, void *arg);
void (*rtop_destroy)(range_tree_t *rt, void *arg);
- void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
- void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+ void (*rtop_add)(range_tree_t *rt, void *rs, void *arg);
+ void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg);
void (*rtop_vacate)(range_tree_t *rt, void *arg);
};
+static inline uint64_t
+rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ return (((range_seg32_t *)rs)->rs_start);
+ case RANGE_SEG64:
+ return (((range_seg64_t *)rs)->rs_start);
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_start);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+}
+
+static inline uint64_t
+rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ return (((range_seg32_t *)rs)->rs_end);
+ case RANGE_SEG64:
+ return (((range_seg64_t *)rs)->rs_end);
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_end);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+}
+
+static inline uint64_t
+rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32: {
+ const range_seg32_t *r32 = rs;
+ return (r32->rs_end - r32->rs_start);
+ }
+ case RANGE_SEG64: {
+ const range_seg64_t *r64 = rs;
+ return (r64->rs_end - r64->rs_start);
+ }
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_fill);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+
+}
+
+static inline uint64_t
+rs_get_start(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_end(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_fill(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return (rs_get_fill_raw(rs, rt) << rt->rt_shift);
+}
+
+static inline void
+rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ ASSERT3U(start, <=, UINT32_MAX);
+ ((range_seg32_t *)rs)->rs_start = (uint32_t)start;
+ break;
+ case RANGE_SEG64:
+ ((range_seg64_t *)rs)->rs_start = start;
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_start = start;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ ASSERT3U(end, <=, UINT32_MAX);
+ ((range_seg32_t *)rs)->rs_end = (uint32_t)end;
+ break;
+ case RANGE_SEG64:
+ ((range_seg64_t *)rs)->rs_end = end;
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_end = end;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ /* fall through */
+ case RANGE_SEG64:
+ ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs,
+ rt));
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_fill = fill;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+ ASSERT3U(start, >=, rt->rt_start);
+ ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift));
+ rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+ ASSERT3U(end, >=, rt->rt_start);
+ ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift));
+ rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+ ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift));
+ rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
+}
+
typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
-void range_tree_init(void);
-void range_tree_fini(void);
-range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare) (const void *, const void *), uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops,
+ range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+ int (*zfs_btree_compare) (const void *, const void *), uint64_t gap);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
uint64_t *ostart, uint64_t *osize);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
-range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
@@ -120,12 +316,12 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
range_tree_t *addto);
-void rt_avl_create(range_tree_t *rt, void *arg);
-void rt_avl_destroy(range_tree_t *rt, void *arg);
-void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_vacate(range_tree_t *rt, void *arg);
-extern struct range_tree_ops rt_avl_ops;
+void rt_btree_create(range_tree_t *rt, void *arg);
+void rt_btree_destroy(range_tree_t *rt, void *arg);
+void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_vacate(range_tree_t *rt, void *arg);
+extern range_tree_ops_t rt_btree_ops;
#ifdef __cplusplus
}
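
A short sketch of what the rt_start/rt_shift encoding above buys (not part of the patch; the numbers are hypothetical, and it relies on the existing range_tree_add()/range_tree_remove() entry points that this diff leaves unchanged): a RANGE_SEG32 tree based at 1 GiB with 512-byte-aligned offsets stores each segment as two 32-bit raw values, which the rs_get_*() accessors widen back to full offsets.

#include <sys/range_tree.h>

static void
range_seg32_example(void)
{
        /* Region based at 1 GiB, all offsets multiples of 512 (shift = 9). */
        range_tree_t *rt = range_tree_create(NULL, RANGE_SEG32, NULL,
            1ULL << 30, 9);

        range_tree_add(rt, (1ULL << 30) + 4096, 8192);

        range_seg_t *rs = range_tree_find(rt, (1ULL << 30) + 4096, 8192);

        /* Stored raw: relative to rt_start, in units of 1 << rt_shift. */
        ASSERT3U(rs_get_start_raw(rs, rt), ==, 8);      /* 4096 >> 9 */
        ASSERT3U(rs_get_end_raw(rs, rt), ==, 24);       /* 12288 >> 9 */

        /* The accessors reapply the shift and base. */
        ASSERT3U(rs_get_start(rs, rt), ==, (1ULL << 30) + 4096);
        ASSERT3U(rs_get_end(rs, rt), ==, (1ULL << 30) + 12288);

        range_tree_remove(rt, (1ULL << 30) + 4096, 8192);
        range_tree_destroy(rt);
}
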
diff --git a/include/sys/spa.h b/include/sys/spa.h
index caa2157b9..18933e0d4 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -43,6 +43,7 @@
#include <sys/spa_checksum.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
+#include <sys/bitops.h>
#ifdef __cplusplus
extern "C" {
@@ -70,51 +71,6 @@ struct dsl_dataset;
struct dsl_crypto_params;
/*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
-#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
-#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
-#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
-
-#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
-#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-
-#define BF32_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1U << (len)); \
- ASSERT3U(low + len, <=, 32); \
- (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF64_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1ULL << (len)); \
- ASSERT3U(low + len, <=, 64); \
- ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF32_GET_SB(x, low, len, shift, bias) \
- ((BF32_GET(x, low, len) + (bias)) << (shift))
-#define BF64_GET_SB(x, low, len, shift, bias) \
- ((BF64_GET(x, low, len) + (bias)) << (shift))
-
-/*
- * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
- * the case where val is a constant. We can't fix ASSERT because it's used as
- * an expression in several places in the kernel; as a result, changing it to
- * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
- */
-#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-
-/*
* We currently support block sizes from 512 bytes to 16MB.
* The benefits of larger blocks, and thus larger IO, need to be weighed
* against the cost of COWing a giant block to modify one byte, and the
diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h
index 249b15be6..ca9d41dc1 100644
--- a/include/sys/space_reftree.h
+++ b/include/sys/space_reftree.h
@@ -31,7 +31,7 @@
#define _SYS_SPACE_REFTREE_H
#include <sys/range_tree.h>
-
+#include <sys/avl.h>
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 0d0d0234e..c2fbcc549 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -96,8 +96,8 @@ extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
- range_seg_t *physical_rs);
+extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c179191e3..ae82b75c0 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -86,8 +86,8 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
* Given a target vdev, translates the logical range "in" to the physical
* range "res"
*/
-typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
- range_seg_t *res);
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in,
+ range_seg64_t *res);
typedef const struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -526,8 +526,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
-extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
- range_seg_t *out);
+extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in,
+ range_seg64_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index ef638a498..e88ace53f 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -796,7 +796,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);
- return (AVL_ISIGN(rv));
+ return (TREE_ISIGN(rv));
}
void
diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c
index d3ba58a9e..31b2ca30e 100644
--- a/lib/libzfs/libzfs_iter.c
+++ b/lib/libzfs/libzfs_iter.c
@@ -302,7 +302,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg)
lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
- return (AVL_CMP(lcreate, rcreate));
+ return (TREE_CMP(lcreate, rcreate));
}
int
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 0b6b31568..29cd24eed 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -511,7 +511,7 @@ fsavl_compare(const void *arg1, const void *arg2)
const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1;
const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2;
- return (AVL_CMP(fn1->fn_guid, fn2->fn_guid));
+ return (TREE_CMP(fn1->fn_guid, fn2->fn_guid));
}
/*
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 97f9abcf4..ca3038f7d 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -45,6 +45,7 @@ KERNEL_C = \
bplist.c \
bpobj.c \
bptree.c \
+ btree.c \
bqueue.c \
cityhash.c \
dbuf.c \
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
index e85ce4594..b8cdc118b 100644
--- a/lib/libzutil/zutil_import.c
+++ b/lib/libzutil/zutil_import.c
@@ -972,11 +972,11 @@ slice_cache_compare(const void *arg1, const void *arg2)
uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
int rv;
- rv = AVL_ISIGN(strcmp(nm1, nm2));
+ rv = TREE_ISIGN(strcmp(nm1, nm2));
if (rv)
return (rv);
- return (AVL_CMP(guid1, guid2));
+ return (TREE_CMP(guid1, guid2));
}
static int
diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c
index a4d336abe..ebfaae67b 100644
--- a/module/os/linux/zfs/zfs_znode.c
+++ b/module/os/linux/zfs/zfs_znode.c
@@ -247,7 +247,7 @@ zfs_znode_hold_compare(const void *a, const void *b)
const znode_hold_t *zh_a = (const znode_hold_t *)a;
const znode_hold_t *zh_b = (const znode_hold_t *)b;
- return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
+ return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}
boolean_t
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index b60b799b5..c00c8e424 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -22,6 +22,7 @@ $(MODULE)-objs += blkptr.o
$(MODULE)-objs += bplist.o
$(MODULE)-objs += bpobj.o
$(MODULE)-objs += bptree.o
+$(MODULE)-objs += btree.o
$(MODULE)-objs += bqueue.o
$(MODULE)-objs += cityhash.o
$(MODULE)-objs += dataset_kstats.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 9b90690b6..c1ad8785d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -4759,7 +4759,6 @@ arc_kmem_reap_soon(void)
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
- extern kmem_cache_t *range_seg_cache;
#ifdef _KERNEL
if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
@@ -4796,7 +4795,7 @@ arc_kmem_reap_soon(void)
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_full_cache);
kmem_cache_reap_now(hdr_l2only_cache);
- kmem_cache_reap_now(range_seg_cache);
+ kmem_cache_reap_now(zfs_btree_leaf_cache);
if (zio_arena != NULL) {
/*
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
new file mode 100644
index 000000000..8f514caab
--- /dev/null
+++ b/module/zfs/btree.c
@@ -0,0 +1,2124 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/btree.h>
+#include <sys/bitops.h>
+#include <sys/zfs_context.h>
+
+kmem_cache_t *zfs_btree_leaf_cache;
+
+/*
+ * Control the extent of the verification that occurs when zfs_btree_verify is
+ * called. Primarily used for debugging when extending the btree logic and
+ * functionality. As the intensity is increased, new verification steps are
+ * added. These steps are cumulative; intensity = 3 includes the intensity = 1
+ * and intensity = 2 steps as well.
+ *
+ * Intensity 1: Verify that the tree's height is consistent throughout.
+ * Intensity 2: Verify that a core node's children's parent pointers point
+ * to the core node.
+ * Intensity 3: Verify that the total number of elements in the tree matches the
+ * sum of the number of elements in each node. Also verifies that each node's
+ * count obeys the invariants (less than or equal to maximum value, greater than
+ * or equal to half the maximum minus one).
+ * Intensity 4: Verify that each element compares less than the element
+ * immediately after it and greater than the one immediately before it using the
+ * comparator function. For core nodes, also checks that each element is greater
+ * than the last element in the first of the two nodes it separates, and less
+ * than the first element in the second of the two nodes.
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
+ * of each node is poisoned appropriately. Note that poisoning always occurs if
+ * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
+ * operation.
+ *
+ * Intensity 4 and 5 are particularly expensive to perform; the previous levels
+ * are a few memory operations per node, while these levels require multiple
+ * operations per element. In addition, when creating large btrees, these
+ * operations are called at every step, resulting in extremely slow operation
+ * (while the asymptotic complexity of the other steps is the same, the
+ * importance of the constant factors cannot be denied).
+ */
+int zfs_btree_verify_intensity = 0;
+
+/*
+ * A convenience function to silence warnings from memmove's return value and
+ * change argument order to src, dest.
+ */
+void
+bmov(const void *src, void *dest, size_t size)
+{
+ (void) memmove(dest, src, size);
+}
+
+#ifdef _ILP32
+#define BTREE_POISON 0xabadb10c
+#else
+#define BTREE_POISON 0xabadb10cdeadbeef
+#endif
+
+static void
+zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
+ BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
+ hdr->bth_count * size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ node->btc_children[i] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ }
+ (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
+ (BTREE_CORE_ELEMS - hdr->bth_count) * size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ ASSERT3U(offset, >=, hdr->bth_count);
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + offset * size, 0x0f, size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ node->btc_children[offset + 1] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ (void) memset(node->btc_elems + offset * size, 0x0f, size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ uint8_t eval = 0x0f;
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
+ VERIFY3P(node->btc_children[offset + 1], ==, cval);
+ for (int i = 0; i < size; i++)
+ VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+ } else {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 0; i < size; i++)
+ VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+ }
+#endif
+}
+
+void
+zfs_btree_init(void)
+{
+ zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
+ BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+}
+
+void
+zfs_btree_fini(void)
+{
+ kmem_cache_destroy(zfs_btree_leaf_cache);
+}
+
+void
+zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
+ size_t size)
+{
+ /*
+ * We need a minimum of 4 elements so that when we split a node we
+ * always have at least two elements in each node. This simplifies the
+ * logic in zfs_btree_bulk_finish, since it means the last leaf will
+ * always have a left sibling to share with (unless it's the root).
+ */
+ ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4);
+
+ bzero(tree, sizeof (*tree));
+ tree->bt_compar = compar;
+ tree->bt_elem_size = size;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Find value in the array of elements provided. Uses a simple binary search.
+ */
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems,
+ const void *value, zfs_btree_index_t *where)
+{
+ uint64_t max = nelems;
+ uint64_t min = 0;
+ while (max > min) {
+ uint64_t idx = (min + max) / 2;
+ uint8_t *cur = buf + idx * tree->bt_elem_size;
+ int comp = tree->bt_compar(cur, value);
+ if (comp == -1) {
+ min = idx + 1;
+ } else if (comp == 1) {
+ max = idx;
+ } else {
+ ASSERT0(comp);
+ where->bti_offset = idx;
+ where->bti_before = B_FALSE;
+ return (cur);
+ }
+ }
+
+ where->bti_offset = max;
+ where->bti_before = B_TRUE;
+ return (NULL);
+}
+
+/*
+ * Find the given value in the tree. where may be passed as null to use as a
+ * membership test or if the btree is being used as a map.
+ */
+void *
+zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ if (where != NULL) {
+ where->bti_node = NULL;
+ where->bti_offset = 0;
+ }
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+
+ /*
+ * If we're in bulk-insert mode, we check the last spot in the tree
+ * and the last leaf in the tree before doing the normal search,
+ * because for most workloads the vast majority of finds in
+ * bulk-insert mode are to insert new elements.
+ */
+ zfs_btree_index_t idx;
+ if (tree->bt_bulk != NULL) {
+ zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
+ int compar = tree->bt_compar(last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size),
+ value);
+ if (compar < 0) {
+ /*
+ * If what they're looking for is after the last
+ * element, it's not in the tree.
+ */
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count;
+ where->bti_before = B_TRUE;
+ }
+ return (NULL);
+ } else if (compar == 0) {
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) *
+ tree->bt_elem_size));
+ }
+ if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) {
+ /*
+ * If what they're looking for is after the first
+ * element in the last leaf, it's in the last leaf or
+ * it's not in the tree.
+ */
+ void *d = zfs_btree_find_in_buf(tree,
+ last_leaf->btl_elems, last_leaf->btl_hdr.bth_count,
+ value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
+ *where = idx;
+ }
+ return (d);
+ }
+ }
+
+ zfs_btree_core_t *node = NULL;
+ uint64_t child = 0;
+ uint64_t depth = 0;
+
+ /*
+ * Iterate down the tree, finding which child the value should be in
+ * by comparing with the separators.
+ */
+ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
+ node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
+ ASSERT3P(node, !=, NULL);
+ void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ node->btc_hdr.bth_count, value, &idx);
+ EQUIV(d != NULL, !idx.bti_before);
+ if (d != NULL) {
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)node;
+ *where = idx;
+ }
+ return (d);
+ }
+ ASSERT(idx.bti_before);
+ child = idx.bti_offset;
+ }
+
+ /*
+ * The value is in this leaf, or it would be if it were in the
+ * tree. Find its proper location and return it.
+ */
+ zfs_btree_leaf_t *leaf = (depth == 0 ?
+ (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
+ void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+ leaf->btl_hdr.bth_count, value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)leaf;
+ *where = idx;
+ }
+
+ return (d);
+}
+
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. First, a shift is a movement of
+ * elements within a node. It is used to create gaps for inserting new
+ * elements and children, or cover gaps created when things are removed. A
+ * shift has two fundamental properties, each of which can be one of two
+ * values, making four types of shifts. There is the direction of the shift
+ * (left or right) and the shape of the shift (parallelogram or isosceles
+ * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
+ * applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ * Elements: * * * * * * * ... * * *
+ * Children: * * * * * * * * ... * * *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. A left and right shift are fairly self-explanatory; a left
+ * shift moves things to the left, while a right shift moves things to the
+ * right. A parallelogram shift is a shift with the same number of elements
+ * and children being moved, while a trapezoid shift is a shift that moves one
+ * more child than elements. An example follows:
+ *
+ * A parallelogram shift could contain the following:
+ * _______________
+ * \* * * * \ * * * ... * * *
+ * * \ * * * *\ * * * ... * * *
+ * ---------------
+ * A trapezoid shift could contain the following:
+ * ___________
+ * * / * * * \ * * * ... * * *
+ * * / * * * *\ * * * ... * * *
+ * ---------------
+ *
+ * Note that a parallelogram shift is always shaped like a "left-leaning"
+ * parallelogram, where the starting index of the children being moved is
+ * always one higher than the starting index of the elements being moved. No
+ * "right-leaning" parallelogram shifts are needed (shifts where the starting
+ * element index and starting child index being moved are the same) to achieve
+ * any btree operations, so we ignore them.
+ */
+
+enum bt_shift_shape {
+ BSS_TRAPEZOID,
+ BSS_PARALLELOGRAM
+};
+
+enum bt_shift_direction {
+ BSD_LEFT,
+ BSD_RIGHT
+};
+
+/*
+ * Shift elements and children in the provided core node by off spots. The
+ * first element moved is idx, and count elements are moved. The shape of the
+ * shift is determined by shape. The direction is determined by dir.
+ */
+static inline void
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_shape shape,
+ enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(node->btc_hdr.bth_core);
+
+ uint8_t *e_start = node->btc_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *e_out = e_start + sign * off * size;
+ uint64_t e_count = count;
+ bmov(e_start, e_out, e_count * size);
+
+ zfs_btree_hdr_t **c_start = node->btc_children + idx +
+ (shape == BSS_TRAPEZOID ? 0 : 1);
+ zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
+ c_start + off);
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(c_start, c_out, c_count * sizeof (*c_start));
+}
+
+/*
+ * Shift elements and children in the provided core node left by one spot.
+ * The first element moved is idx, and count elements are moved. The
+ * shape of the shift is determined by trap; true if the shift is a trapezoid,
+ * false if it is a parallelogram.
+ */
+static inline void
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
+}
+
+/*
+ * Shift elements and children in the provided core node right by one spot.
+ * Starts with elements[idx] and children[idx] and one more child than element.
+ */
+static inline void
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
+}
+
+/*
+ * Shift elements and children in the provided leaf node by off spots.
+ * The first element moved is idx, and count elements are moved. The direction
+ * is determined by dir.
+ */
+static inline void
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!node->btl_hdr.bth_core);
+
+ uint8_t *start = node->btl_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *out = start + sign * off * size;
+ bmov(start, out, count * size);
+}
+
+static inline void
+bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+}
+
+static inline void
+bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+}
+
+/*
+ * Move children and elements from one core node to another. The shape
+ * parameter behaves the same as it does in the shift logic.
+ */
+static inline void
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+ enum bt_shift_shape shape)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(source->btc_hdr.bth_core);
+ ASSERT(dest->btc_hdr.bth_core);
+
+ bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+ count * size);
+
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ c_count * sizeof (*source->btc_children));
+}
+
+static inline void
+bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!source->btl_hdr.bth_core);
+ ASSERT(!dest->btl_hdr.bth_core);
+
+ bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size,
+ count * size);
+}
+
+/*
+ * Find the first element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[0])
+ ;
+
+ ASSERT(!node->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = 0;
+ where->bti_before = B_FALSE;
+ }
+ return (&leaf->btl_elems[0]);
+}
+
+/* Insert an element and a child into a core node at the given offset. */
+static void
+zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
+ uint64_t offset, zfs_btree_hdr_t *new_node, void *buf)
+{
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+ ASSERT3P(par_hdr, ==, new_node->bth_parent);
+ ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, par_hdr,
+ par_hdr->bth_count);
+ }
+ /* Shift existing elements and children */
+ uint64_t count = par_hdr->bth_count - offset;
+ bt_shift_core_right(tree, parent, offset, count,
+ BSS_PARALLELOGRAM);
+
+ /* Insert new values */
+ parent->btc_children[offset + 1] = new_node;
+ bmov(buf, parent->btc_elems + offset * size, size);
+ par_hdr->bth_count++;
+}
+
+/*
+ * Insert new_node into the parent of old_node directly after old_node, with
+ * buf as the dividing element between the two.
+ */
+static void
+zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
+ zfs_btree_hdr_t *new_node, void *buf)
+{
+ ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_core_t *parent = old_node->bth_parent;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+
+ /*
+ * If this is the root node we were splitting, we create a new root
+ * and increase the height of the tree.
+ */
+ if (parent == NULL) {
+ ASSERT3P(old_node, ==, tree->bt_root);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_root =
+ kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
+ size, KM_SLEEP);
+ zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
+ new_root_hdr->bth_parent = NULL;
+ new_root_hdr->bth_core = B_TRUE;
+ new_root_hdr->bth_count = 1;
+
+ old_node->bth_parent = new_node->bth_parent = new_root;
+ new_root->btc_children[0] = old_node;
+ new_root->btc_children[1] = new_node;
+ bmov(buf, new_root->btc_elems, size);
+
+ tree->bt_height++;
+ tree->bt_root = new_root_hdr;
+ zfs_btree_poison_node(tree, new_root_hdr);
+ return;
+ }
+
+ /*
+ * Since we have the new separator, binary search for where to put
+ * new_node.
+ */
+ zfs_btree_index_t idx;
+ ASSERT(par_hdr->bth_core);
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ par_hdr->bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ uint64_t offset = idx.bti_offset;
+ ASSERT3U(offset, <=, par_hdr->bth_count);
+ ASSERT3P(parent->btc_children[offset], ==, old_node);
+
+ /*
+ * If the parent isn't full, shift things to accommodate our insertions
+ * and return.
+ */
+ if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
+ return;
+ }
+
+ /*
+ * We need to split this core node into two. Currently there are
+ * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
+ * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
+ * current node, and the others will be moved to the new core node.
+ * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
+ * will be used as the new separator in our parent, and the others
+ * will be split among the two core nodes.
+ *
+ * Usually we will split the node in half evenly, with
+ * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
+ * instead move only about a quarter of the elements (and children) to
+ * the new node. Since the average state after a long time is a 3/4
+ * full node, shortcutting directly to that state improves efficiency.
+ *
+ * We do this in two stages: first we split into two nodes, and then we
+ * reuse our existing logic to insert the new element and child.
+ */
+ uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
+ 2 : 4)) - 1, 2);
+ uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
+ ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * size, KM_SLEEP);
+ zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
+ new_par_hdr->bth_parent = par_hdr->bth_parent;
+ new_par_hdr->bth_core = B_TRUE;
+ new_par_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_par_hdr);
+
+ par_hdr->bth_count = keep_count;
+
+ bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
+ 0, BSS_TRAPEZOID);
+
+ /* Store the new separator in a buffer. */
+ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
+ bmov(parent->btc_elems + keep_count * size, tmp_buf,
+ size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ if (offset < keep_count) {
+ /* Insert the new node into the left half */
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node,
+ buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else if (offset > keep_count) {
+ /* Insert the new node into the right half */
+ new_node->bth_parent = new_parent;
+ zfs_btree_insert_core_impl(tree, new_parent,
+ offset - keep_count - 1, new_node, buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else {
+ /*
+ * Move the new separator into the right half, and replace it
+ * with buf. We also need to shift back the elements in the
+ * right half to accommodate new_node.
+ */
+ bt_shift_core_right(tree, new_parent, 0, move_count,
+ BSS_TRAPEZOID);
+ new_parent->btc_children[0] = new_node;
+ bmov(tmp_buf, new_parent->btc_elems, size);
+ new_par_hdr->bth_count++;
+ }
+ kmem_free(tmp_buf, size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++)
+ new_parent->btc_children[i]->bth_parent = new_parent;
+
+ for (int i = 0; i <= parent->btc_hdr.bth_count; i++)
+ ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+ * parent. This may cause further splitting.
+ */
+ zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
+ &new_parent->btc_hdr, buf);
+}
+
+/* Insert an element into a leaf node at the given offset. */
+static void
+zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ uint64_t idx, const void *value)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint8_t *start = leaf->btl_elems + (idx * size);
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ ASSERTV(uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2));
+ uint64_t count = leaf->btl_hdr.bth_count - idx;
+ ASSERT3U(leaf->btl_hdr.bth_count, <, capacity);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
+ leaf->btl_hdr.bth_count);
+ }
+
+ bt_shift_leaf_right(tree, leaf, idx, count);
+ bmov(value, start, size);
+ hdr->bth_count++;
+}
+
+/* Helper function for inserting a new value into leaf at the given index. */
+static void
+zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ const void *value, uint64_t idx)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+ * If the leaf isn't full, shift the elements after idx and insert
+ * value.
+ */
+ if (leaf->btl_hdr.bth_count != capacity) {
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ return;
+ }
+
+ /*
+ * Otherwise, we split the leaf node into two nodes. If we're not bulk
+ * inserting, each is of size (capacity / 2). If we are bulk
+ * inserting, we move a quarter of the elements to the new node so
+ * inserts into the old node don't cause immediate splitting but the
+ * tree stays relatively dense. Since the average state after a long
+ * time is a 3/4 full node, shortcutting directly to that state
+ * improves efficiency. At the end of the bulk insertion process
+ * we'll need to go through and fix up any nodes (the last leaf and
+ * its ancestors, potentially) that are below the minimum.
+ *
+ * In either case, we're left with one extra element. The leftover
+ * element will become the new dividing element between the two nodes.
+ */
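+	/*
+	 * As a hypothetical example, if a leaf's capacity were 100 elements,
+	 * a normal split would keep 50 here, move 49 to the new leaf, and
+	 * promote one element as the separator; a bulk-load split would keep
+	 * 75 and move 24.
+	 */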
+ uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) -
+ 1, 2);
+ uint64_t keep_count = capacity - move_count - 1;
+ ASSERT3U(capacity - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
+ new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
+ new_hdr->bth_core = B_FALSE;
+ new_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_hdr);
+
+ leaf->btl_hdr.bth_count = keep_count;
+
+ if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
+ tree->bt_bulk = new_leaf;
+
+ /* Copy the back part to the new leaf. */
+ bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
+ 0);
+
+ /* We store the new separator in a buffer we control for simplicity. */
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(leaf->btl_elems + (keep_count * size), buf, size);
+ zfs_btree_poison_node(tree, &leaf->btl_hdr);
+
+ if (idx < keep_count) {
+ /* Insert into the existing leaf. */
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ } else if (idx > keep_count) {
+ /* Insert into the new leaf. */
+ zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
+ 1, value);
+ } else {
+ /*
+ * Shift the elements in the new leaf to make room for the
+ * separator, and use the new value as the new separator.
+ */
+ bt_shift_leaf_right(tree, new_leaf, 0, move_count);
+ bmov(buf, new_leaf->btl_elems, size);
+ bmov(value, buf, size);
+ new_hdr->bth_count++;
+ }
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting, but only of core nodes.
+ */
+ zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
+ buf);
+ kmem_free(buf, size);
+}
+
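+/*
+ * Return the index in hdr's parent at which hdr is stored in the child
+ * array, found by searching the parent for hdr's first element.
+ */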
+static uint64_t
+zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ void *buf;
+ if (hdr->bth_core) {
+ buf = ((zfs_btree_core_t *)hdr)->btc_elems;
+ } else {
+ buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+ }
+ zfs_btree_index_t idx;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
+ ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
+ return (idx.bti_offset);
+}
+
+/*
+ * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
+ * nodes may violate the invariant that non-root nodes must be at least half
+ * full. All nodes violating this invariant should be the last node in their
+ * particular level. To correct the invariant, we take values from their left
+ * neighbor until they are half full. They must have a left neighbor at their
+ * level because the last node at a level is not the first node unless it's
+ * the root.
+ */
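+/*
+ * As a hypothetical example, with a leaf capacity of 100, a last leaf holding
+ * only 10 elements is brought up to 50: its contents shift right, the old
+ * separator plus 39 elements from the left neighbor's tail move in, and one
+ * more neighbor element becomes the new separator, shrinking the neighbor by
+ * 40 in total.
+ */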
+static void
+zfs_btree_bulk_finish(zfs_btree_t *tree)
+{
+ ASSERT3P(tree->bt_bulk, !=, NULL);
+ ASSERT3P(tree->bt_root, !=, NULL);
+ zfs_btree_leaf_t *leaf = tree->bt_bulk;
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+	 * The invariant doesn't apply to the root node; if that's the only
+	 * node in the tree, we're done.
+ */
+ if (parent == NULL) {
+ tree->bt_bulk = NULL;
+ return;
+ }
+
+ /* First, take elements to rebalance the leaf node. */
+ if (hdr->bth_count < capacity / 2) {
+ /*
+ * First, find the left neighbor. The simplest way to do this
+ * is to call zfs_btree_prev twice; the first time finds some
+ * ancestor of this node, and the second time finds the left
+ * neighbor. The ancestor found is the lowest common ancestor
+ * of leaf and the neighbor.
+ */
+ zfs_btree_index_t idx = {
+ .bti_node = hdr,
+ .bti_offset = 0
+ };
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(idx.bti_node->bth_core);
+ zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
+ uint64_t common_idx = idx.bti_offset;
+
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(!idx.bti_node->bth_core);
+ zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
+ zfs_btree_hdr_t *l_hdr = idx.bti_node;
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ leaf->btl_hdr.bth_count + i);
+ }
+ }
+
+ /* First, shift elements in leaf back. */
+ bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count,
+ BSD_RIGHT);
+
+ /* Next, move the separator from the common ancestor to leaf. */
+ uint8_t *separator = common->btc_elems + (common_idx * size);
+ uint8_t *out = leaf->btl_elems + ((move_count - 1) * size);
+ bmov(separator, out, size);
+ move_count--;
+
+ /*
+ * Now we move elements from the tail of the left neighbor to
+ * fill the remaining spots in leaf.
+ */
+ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
+ move_count, move_count, leaf, 0);
+
+ /*
+ * Finally, move the new last element in the left neighbor to
+ * the separator.
+ */
+ bmov(l_neighbor->btl_elems + (l_hdr->bth_count -
+ move_count - 1) * size, separator, size);
+
+ /* Adjust the node's counts, and we're done. */
+ l_hdr->bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+ zfs_btree_poison_node(tree, l_hdr);
+ }
+
+ /*
+ * Now we have to rebalance any ancestors of leaf that may also
+ * violate the invariant.
+ */
+ capacity = BTREE_CORE_ELEMS;
+ while (parent->btc_hdr.bth_parent != NULL) {
+ zfs_btree_core_t *cur = parent;
+ zfs_btree_hdr_t *hdr = &cur->btc_hdr;
+ parent = hdr->bth_parent;
+ /*
+ * If the invariant isn't violated, move on to the next
+ * ancestor.
+ */
+ if (hdr->bth_count >= capacity / 2)
+ continue;
+
+ /*
+ * Because the smallest number of nodes we can move when
+ * splitting is 2, we never need to worry about not having a
+ * left sibling (a sibling is a neighbor with the same parent).
+ */
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ ASSERT3U(parent_idx, >, 0);
+ zfs_btree_core_t *l_neighbor =
+ (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ hdr->bth_count + i);
+ }
+ }
+ /* First, shift things in the right node back. */
+ bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
+ BSS_TRAPEZOID, BSD_RIGHT);
+
+ /* Next, move the separator to the right node. */
+ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
+ size);
+ uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
+ bmov(separator, e_out, size);
+
+ /*
+ * Now, move elements and children from the left node to the
+ * right. We move one more child than elements.
+ */
+ move_count--;
+ uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
+ bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
+ BSS_TRAPEZOID);
+
+ /*
+ * Finally, move the last element in the left node to the
+ * separator's position.
+ */
+ move_idx--;
+ bmov(l_neighbor->btc_elems + move_idx * size, separator, size);
+
+ l_neighbor->btc_hdr.bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+
+ zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
+
+ for (int i = 0; i <= hdr->bth_count; i++)
+ cur->btc_children[i]->bth_parent = cur;
+ }
+
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Insert value into tree at the location specified by where.
+ */
+void
+zfs_btree_insert(zfs_btree_t *tree, const void *value,
+ const zfs_btree_index_t *where)
+{
+ zfs_btree_index_t idx = {0};
+
+ /* If we're not inserting in the last leaf, end bulk insert mode. */
+ if (tree->bt_bulk != NULL) {
+ if (where->bti_node != &tree->bt_bulk->btl_hdr) {
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
+ where = &idx;
+ }
+ }
+
+ tree->bt_num_elems++;
+ /*
+ * If this is the first element in the tree, create a leaf root node
+ * and add the value to it.
+ */
+ if (where->bti_node == NULL) {
+ ASSERT3U(tree->bt_num_elems, ==, 1);
+ ASSERT3S(tree->bt_height, ==, -1);
+ ASSERT3P(tree->bt_root, ==, NULL);
+ ASSERT0(where->bti_offset);
+
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ tree->bt_root = &leaf->btl_hdr;
+ tree->bt_height++;
+
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ hdr->bth_parent = NULL;
+ hdr->bth_core = B_FALSE;
+ hdr->bth_count = 0;
+ zfs_btree_poison_node(tree, hdr);
+
+ zfs_btree_insert_into_leaf(tree, leaf, value, 0);
+ tree->bt_bulk = leaf;
+ } else if (!where->bti_node->bth_core) {
+ /*
+ * If we're inserting into a leaf, go directly to the helper
+ * function.
+ */
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)where->bti_node, value,
+ where->bti_offset);
+ } else {
+ /*
+ * If we're inserting into a core node, we can't just shift
+ * the existing element in that slot in the same node without
+ * breaking our ordering invariants. Instead we place the new
+ * value in the node at that spot and then insert the old
+ * separator into the first slot in the subtree to the right.
+ */
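+		/*
+		 * As an illustration, inserting 41 at the slot of separator
+		 * 43: 41 takes 43's place in this core node, and 43 is
+		 * re-inserted as the first element of the leftmost leaf of
+		 * the subtree to the right of that slot.
+		 */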
+ ASSERT(where->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
+
+ /*
+ * We can ignore bti_before, because either way the value
+ * should end up in bti_offset.
+ */
+ uint64_t off = where->bti_offset;
+ zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
+ size_t size = tree->bt_elem_size;
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(node->btc_elems + off * size, buf, size);
+ bmov(value, node->btc_elems + off * size, size);
+
+ /*
+ * Find the first slot in the subtree to the right, insert
+ * there.
+ */
+ zfs_btree_index_t new_idx;
+ VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+ ASSERT0(new_idx.bti_offset);
+ ASSERT(!new_idx.bti_node->bth_core);
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
+ kmem_free(buf, size);
+ }
+ zfs_btree_verify(tree);
+}
+
+/*
+ * Return the first element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+ zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+ ;
+
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = node->bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next node in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
+ * node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx,
+ void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents
+ * until we find one where our ancestor isn't the last child
+ * of its parent. Once we do, the next element is the
+ * separator after our ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+ if (leaf->btl_hdr.bth_count > new_off) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = new_off;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + new_off * tree->bt_elem_size);
+ }
+
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (done_func != NULL)
+ done_func(tree, prev);
+ if (i == hdr->bth_count) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + i * tree->bt_elem_size);
+ }
+ if (done_func != NULL)
+ done_func(tree, prev);
+ /*
+ * We've traversed all the way up and been at the end of the
+ * node every time, so this was the last element in the tree.
+ */
+ return (NULL);
+ }
+
+ /* If we were before an element in a core node, return that element. */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ if (idx->bti_before) {
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + offset * tree->bt_elem_size);
+ }
+
+ /*
+ * The next element from one in a core node is the first element in
+ * the subtree just to the right of the separator.
+ */
+ zfs_btree_hdr_t *child = node->btc_children[offset + 1];
+ return (zfs_btree_first_helper(child, out_idx));
+}
+
+/*
+ * Return the next element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
+}
+
+/*
+ * Return the previous element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the previous element of an element in a leaf,
+ * there are two cases. If the element isn't the first one in
+		 * the leaf, we just return the previous element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we find
+		 * one where our ancestor isn't the first child of its parent.
+		 * Once we do, the previous element is the separator just
+		 * before our ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ if (offset != 0) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = offset - 1;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + (offset - 1) *
+ tree->bt_elem_size);
+ }
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (i == 0) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i - 1;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + (i - 1) * tree->bt_elem_size);
+ }
+ /*
+ * We've traversed all the way up and been at the start of the
+		 * node every time, so this was the first element in the tree.
+ */
+ return (NULL);
+ }
+
+ /*
+ * The previous element from one in a core node is the last element in
+ * the subtree just to the left of the separator.
+ */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ zfs_btree_hdr_t *child = node->btc_children[offset];
+ return (zfs_btree_last_helper(tree, child, out_idx));
+}
+
+/*
+ * Get the value at the provided index in the tree.
+ *
+ * Note that the value returned from this function can be mutated, but only
+ * if it will not change the ordering of the element with respect to any other
+ * elements that could be in the tree.
+ */
+void *
+zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
+{
+ ASSERT(!idx->bti_before);
+ if (!idx->bti_node->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+ }
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+}
+
+/* Add the given value to the tree. Must not already be in the tree. */
+void
+zfs_btree_add(zfs_btree_t *tree, const void *node)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
+ zfs_btree_insert(tree, node, &where);
+}
+
+/* Helper function to free a tree node. */
+static void
+zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
+{
+ tree->bt_num_nodes--;
+ if (!node->bth_core) {
+ kmem_cache_free(zfs_btree_leaf_cache, node);
+ } else {
+ kmem_free(node, sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * tree->bt_elem_size);
+ }
+}
+
+/*
+ * Remove the rm_hdr and the separator to its left from the parent node. The
+ * buffer that rm_hdr was stored in may already be freed, so its contents
+ * cannot be accessed.
+ */
+static void
+zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
+ zfs_btree_hdr_t *rm_hdr)
+{
+ size_t size = tree->bt_elem_size;
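+	/* The minimum element count for a non-root core node. */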
+ uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ /*
+ * If the node is the root node and rm_hdr is one of two children,
+ * promote the other child to the root.
+ */
+ if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
+ ASSERT3U(hdr->bth_count, ==, 1);
+ ASSERT3P(tree->bt_root, ==, node);
+ ASSERT3P(node->btc_children[1], ==, rm_hdr);
+ tree->bt_root = node->btc_children[0];
+ node->btc_children[0]->bth_parent = NULL;
+ zfs_btree_node_destroy(tree, hdr);
+ tree->bt_height--;
+ return;
+ }
+
+ uint64_t idx;
+ for (idx = 0; idx <= hdr->bth_count; idx++) {
+ if (node->btc_children[idx] == rm_hdr)
+ break;
+ }
+ ASSERT3U(idx, <=, hdr->bth_count);
+
+ /*
+ * If the node is the root or it has more than the minimum number of
+ * children, just remove the child and separator, and return.
+ */
+ if (hdr->bth_parent == NULL ||
+ hdr->bth_count > min_count) {
+		 * Shift the elements and children to the right of rm_hdr to
+ * Shift the element and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+ hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ return;
+ }
+
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a neighbor. We check left, then
+ * right. If the neighbor exists and has more than the minimum number
+	 * of elements, we move the separator between us and them to our
+ * node, move their closest element (last for left, first for right)
+ * to the separator, and move their closest child to our node. Along
+ * the way we need to collapse the gap made by idx, and (for our right
+ * neighbor) the gap made by removing their first element and child.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling (i.e. a neighbor with a different
+ * parent). This isn't critical functionality, but may be worth
+ * implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(l_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
+
+ /*
+ * Start by shifting the elements and children in the current
+ * node to the right by one spot.
+ */
+ bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
+
+ /*
+ * Move the separator between node and neighbor to the first
+ * element slot in the current node.
+ */
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, node->btc_elems, size);
+
+ /* Move the last child of neighbor to our first child slot. */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children +
+ l_hdr->bth_count;
+ bmov(take_child, node->btc_children, sizeof (*take_child));
+ node->btc_children[0]->bth_parent = node;
+
+ /* Move the last element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems +
+ (l_hdr->bth_count - 1) * size;
+ bmov(take_elem, separator, size);
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(r_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
+
+ /*
+ * Shift elements in node left by one spot to overwrite rm_hdr
+ * and the separator before it.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+
+ /*
+ * Move the separator between node and neighbor to the last
+ * element spot in node.
+ */
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /*
+ * Move the first child of neighbor to the last child spot in
+ * node.
+ */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children;
+ bmov(take_child, node->btc_children + hdr->bth_count,
+ sizeof (*take_child));
+ node->btc_children[hdr->bth_count]->bth_parent = node;
+
+ /* Move the first element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems;
+ bmov(take_elem, separator, size);
+ r_hdr->bth_count--;
+
+ /*
+ * Shift the elements and children of neighbor to cover the
+ * stolen elements.
+ */
+ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
+ BSS_TRAPEZOID);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
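+	/*
+	 * With min_count elements on each side plus the separator, the
+	 * merged node briefly holds 2 * min_count + 1 elements before the
+	 * slot for the removed child is collapsed back out, leaving
+	 * 2 * min_count.
+	 */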
+ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ new_rm_hdr = hdr;
+ new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ new_rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(keep_hdr->bth_core);
+ ASSERT(new_rm_hdr->bth_core);
+
+ zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
+ zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+
+ /* Move the separator into the left node. */
+ uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, e_out, size);
+ keep_hdr->bth_count++;
+
+ /* Move all our elements and children into the left node. */
+ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
+ keep_hdr->bth_count, BSS_TRAPEZOID);
+
+ uint64_t old_count = keep_hdr->bth_count;
+
+ /* Update bookkeeping */
+ keep_hdr->bth_count += new_rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
+
+ /*
+	 * Shift the elements and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
+ bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
+ BSS_PARALLELOGRAM);
+ keep_hdr->bth_count--;
+
+ /* Reparent all our children to point to the left node. */
+ zfs_btree_hdr_t **new_start = keep->btc_children +
+ old_count - 1;
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++)
+ new_start[i]->bth_parent = keep;
+ for (int i = 0; i <= keep_hdr->bth_count; i++) {
+ ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
+ ASSERT3P(keep->btc_children[i], !=, rm_hdr);
+ }
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ new_rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, new_rm_hdr);
+ zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
+}
+
+/* Remove the element at the specified location. */
+void
+zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ size_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *hdr = where->bti_node;
+ uint64_t idx = where->bti_offset;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ ASSERT(!where->bti_before);
+ if (tree->bt_bulk != NULL) {
+ /*
+ * Leave bulk insert mode. Note that our index would be
+ * invalid after we correct the tree, so we copy the value
+ * we're planning to remove and find it again after
+ * bulk_finish.
+ */
+ uint8_t *value = zfs_btree_get(tree, where);
+ uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
+ bmov(value, tmp, size);
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
+ kmem_free(tmp, size);
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ }
+
+ tree->bt_num_elems--;
+ /*
+ * If the element happens to be in a core node, we move a leaf node's
+	 * element into its place and then remove the leaf node element. That
+	 * way the rebalancing logic never needs to recurse both upwards and
+	 * downwards.
+ */
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
+ void *new_value = zfs_btree_last_helper(tree, left_subtree,
+ where);
+ ASSERT3P(new_value, !=, NULL);
+
+ bmov(new_value, node->btc_elems + idx * size, size);
+
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ ASSERT(!where->bti_before);
+ }
+
+ /*
+ * First, we'll update the leaf's metadata. Then, we shift any
+ * elements after the idx to the left. After that, we rebalance if
+ * needed.
+ */
+ ASSERT(!hdr->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ ASSERT3U(hdr->bth_count, >, 0);
+
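+	/* The minimum element count for a non-root leaf node. */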
+ uint64_t min_count = (capacity / 2) - 1;
+
+ /*
+ * If we're over the minimum size or this is the root, just overwrite
+ * the value and return.
+ */
+ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+ hdr->bth_count--;
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+ if (hdr->bth_parent == NULL) {
+ ASSERT0(tree->bt_height);
+ if (hdr->bth_count == 0) {
+ tree->bt_root = NULL;
+ tree->bt_height--;
+ zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+ }
+ }
+ if (tree->bt_root != NULL)
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a sibling. We check left, then
+ * right. If they exist and have more than the minimum number of
+	 * elements, we move the separator between us and them to our node
+ * and move their closest element (last for left, first for right) to
+ * the separator. Along the way we need to collapse the gap made by
+ * idx, and (for our right neighbor) the gap made by removing their
+ * first element.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling. This isn't critical functionality, but may be
+ * worth implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(!l_hdr->bth_core);
+
+ /*
+ * Move our elements back by one spot to make room for the
+ * stolen element and overwrite the element being removed.
+ */
+ bt_shift_leaf_right(tree, leaf, 0, idx);
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+ (l_hdr->bth_count - 1) * size;
+ /* Move the separator to our first spot. */
+ bmov(separator, leaf->btl_elems, size);
+
+ /* Move our neighbor's last element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(!r_hdr->bth_core);
+ zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+ /*
+ * Move our elements after the element being removed forwards
+ * by one spot to make room for the stolen element and
+ * overwrite the element being removed.
+ */
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+ 1);
+
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator between us and them to our last spot. */
+ bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /* Move our neighbor's first element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ r_hdr->bth_count--;
+
+ /*
+		 * Move our neighbor's elements forwards to overwrite the
+ * stolen element.
+ */
+ bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
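+	/*
+	 * Both halves plus the separator give the merged leaf
+	 * 2 * min_count + 1 elements; the element being removed is then
+	 * shifted out, leaving 2 * min_count.
+	 */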
+ zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(!keep_hdr->bth_core);
+ ASSERT(!rm_hdr->bth_core);
+ ASSERT3U(keep_hdr->bth_count, ==, min_count);
+ ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+ zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+ zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+ /*
+ * Move the separator into the first open spot in the left
+ * neighbor.
+ */
+ uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, out, size);
+ keep_hdr->bth_count++;
+
+ /* Move our elements to the left neighbor. */
+ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+ keep_hdr->bth_count);
+
+ /* Update the bookkeeping. */
+ keep_hdr->bth_count += rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+ /* Remove the value from the node */
+ keep_hdr->bth_count--;
+ bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+ new_idx);
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, rm_hdr);
+ /* Remove the emptied node from the parent. */
+ zfs_btree_remove_from_node(tree, parent, rm_hdr);
+ zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+ zfs_btree_remove_from(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+ return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it avoids all of the rebalancing work. Once the
+ * destroy_nodes() function has been called, no other btree operations are
+ * valid until it returns NULL, at which point the only valid operation is
+ * zfs_btree_destroy().
+ *
+ * example:
+ *
+ * zfs_btree_index_t *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node->ptr);
+ * zfs_btree_destroy(tree);
+ *
+ */
+void *
+zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
+{
+ if (*cookie == NULL) {
+ if (tree->bt_height == -1)
+ return (NULL);
+ *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
+ return (zfs_btree_first(tree, *cookie));
+ }
+
+ void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
+ zfs_btree_node_destroy);
+ if (rval == NULL) {
+ tree->bt_root = NULL;
+ tree->bt_height = -1;
+ tree->bt_num_elems = 0;
+ kmem_free(*cookie, sizeof (**cookie));
+ tree->bt_bulk = NULL;
+ }
+ return (rval);
+}
+
+static void
+zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (hdr->bth_core) {
+ zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_clear_helper(tree, btc->btc_children[i]);
+ }
+ }
+
+ zfs_btree_node_destroy(tree, hdr);
+}
+
+void
+zfs_btree_clear(zfs_btree_t *tree)
+{
+ if (tree->bt_root == NULL) {
+ ASSERT0(tree->bt_num_elems);
+ return;
+ }
+
+ zfs_btree_clear_helper(tree, tree->bt_root);
+ tree->bt_num_elems = 0;
+ tree->bt_root = NULL;
+ tree->bt_num_nodes = 0;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
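+/*
+ * The tree must already be empty (e.g. via zfs_btree_clear() or
+ * zfs_btree_destroy_nodes()) before it can be destroyed.
+ */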
+void
+zfs_btree_destroy(zfs_btree_t *tree)
+{
+ ASSERT0(tree->bt_num_elems);
+ ASSERT3P(tree->bt_root, ==, NULL);
+}
+
+/* Verify that every child of this node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core)
+ return;
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
+ zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Verify that every node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers(zfs_btree_t *tree)
+{
+ if (tree->bt_height == -1) {
+ VERIFY3P(tree->bt_root, ==, NULL);
+ return;
+ }
+ VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
+ zfs_btree_verify_pointers_helper(tree, tree->bt_root);
+}
+
+/*
+ * Verify that the current node and its children satisfy the count
+ * invariants, and return the total count in the subtree rooted in this node.
+ */
+static uint64_t
+zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core) {
+ if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) {
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2);
+ VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1);
+ }
+
+ return (hdr->bth_count);
+ } else {
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = hdr->bth_count;
+ if (tree->bt_root != hdr && tree->bt_bulk == NULL)
+ VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_counts_helper(tree,
+ node->btc_children[i]);
+ }
+
+ return (ret);
+ }
+}
+
+/*
+ * Verify that all nodes satisfy the invariants and that the total number of
+ * elements is correct.
+ */
+static void
+zfs_btree_verify_counts(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
+ if (tree->bt_height == -1) {
+ return;
+ }
+ VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
+ tree->bt_num_elems);
+}
+
+/*
+ * Check that the subtree rooted at this node has a uniform height. Returns
+ * the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ int64_t height)
+{
+ if (!hdr->bth_core) {
+ VERIFY0(height);
+ return (1);
+ }
+
+ VERIFY(hdr->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = 1;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_height_helper(tree,
+ node->btc_children[i], height - 1);
+ }
+ return (ret);
+}
+
+/*
+ * Check that the tree has a uniform height, and that bt_height and
+ * bt_num_nodes are correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+ tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+ size, leaf->btl_elems + i * size), ==, -1);
+ }
+ return;
+ }
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+ node->btc_elems + i * size), ==, -1);
+ }
+ for (int i = 0; i < hdr->bth_count; i++) {
+ uint8_t *left_child_last = NULL;
+ zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+ if (left_child_hdr->bth_core) {
+ zfs_btree_core_t *left_child =
+ (zfs_btree_core_t *)left_child_hdr;
+ left_child_last = left_child->btc_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ } else {
+ zfs_btree_leaf_t *left_child =
+ (zfs_btree_leaf_t *)left_child_hdr;
+ left_child_last = left_child->btl_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ left_child_last) != 1) {
+ panic("btree: compar returned %d (expected 1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, left_child_last),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)left_child_last);
+ }
+
+ uint8_t *right_child_first = NULL;
+ zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+ if (right_child_hdr->bth_core) {
+ zfs_btree_core_t *right_child =
+ (zfs_btree_core_t *)right_child_hdr;
+ right_child_first = right_child->btc_elems;
+ } else {
+ zfs_btree_leaf_t *right_child =
+ (zfs_btree_leaf_t *)right_child_hdr;
+ right_child_first = right_child->btl_elems;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ right_child_first) != -1) {
+ panic("btree: compar returned %d (expected -1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, right_child_first),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)right_child_first);
+ }
+ }
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Check that all elements in the tree are in sorted order. */
+static void
+zfs_btree_verify_order(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ zfs_btree_verify_order_helper(tree, tree->bt_root);
+}
+
+#ifdef ZFS_DEBUG
+/* Check that all unused memory is poisoned correctly. */
+static void
+zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t); i++) {
+ VERIFY3U(leaf->btl_elems[i], ==, val);
+ }
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size;
+ i++) {
+ VERIFY3U(node->btc_elems[i], ==, val);
+ }
+
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ VERIFY3P(node->btc_children[i], ==,
+ (zfs_btree_hdr_t *)BTREE_POISON);
+ }
+
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_poison_helper(tree,
+ node->btc_children[i]);
+ }
+ }
+}
+#endif
+
+/* Check that unused memory in the tree is still poisoned. */
+static void
+zfs_btree_verify_poison(zfs_btree_t *tree)
+{
+#ifdef ZFS_DEBUG
+ if (tree->bt_height == -1)
+ return;
+ zfs_btree_verify_poison_helper(tree, tree->bt_root);
+#endif
+}
+
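+/*
+ * Perform increasingly expensive checks as zfs_btree_verify_intensity rises:
+ * 1 verifies the tree height, 2 adds parent pointer checks, 3 adds element
+ * counts, 4 adds ordering checks, and 5 also verifies the poisoning of
+ * unused memory.
+ */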
+void
+zfs_btree_verify(zfs_btree_t *tree)
+{
+ if (zfs_btree_verify_intensity == 0)
+ return;
+ zfs_btree_verify_height(tree);
+ if (zfs_btree_verify_intensity == 1)
+ return;
+ zfs_btree_verify_pointers(tree);
+ if (zfs_btree_verify_intensity == 2)
+ return;
+ zfs_btree_verify_counts(tree);
+ if (zfs_btree_verify_intensity == 3)
+ return;
+ zfs_btree_verify_order(tree);
+
+ if (zfs_btree_verify_intensity == 4)
+ return;
+ zfs_btree_verify_poison(tree);
+}
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index a0f90496a..5e7b31c76 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -783,7 +783,7 @@ ddt_entry_compare(const void *x1, const void *x2)
break;
}
- return (AVL_ISIGN(cmp));
+ return (TREE_ISIGN(cmp));
}
static ddt_t *
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index de57b170f..f7498854a 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1788,7 +1788,7 @@ userquota_compare(const void *l, const void *r)
*/
rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
- return (AVL_ISIGN(rv));
+ return (TREE_ISIGN(rv));
}
static void
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 6249e165f..48c3705c6 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1209,7 +1209,7 @@ guid_compare(const void *arg1, const void *arg2)
const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
- return (AVL_CMP(gmep1->guid, gmep2->guid));
+ return (TREE_CMP(gmep1->guid, gmep2->guid));
}
static void
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 42116a1c3..ae4f91fc3 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1329,11 +1329,11 @@ send_range_after(const struct send_range *from, const struct send_range *to)
return (-1);
if (from_obj >= to_end_obj)
return (1);
- int64_t cmp = AVL_CMP(to->type == OBJECT_RANGE, from->type ==
+ int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
OBJECT_RANGE);
if (unlikely(cmp))
return (cmp);
- cmp = AVL_CMP(to->type == OBJECT, from->type == OBJECT);
+ cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
if (unlikely(cmp))
return (cmp);
if (from->end_blkid <= to->start_blkid)
@@ -1402,7 +1402,7 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2)
uint64_t r1_l0equiv = r1->start_blkid;
uint64_t r2_objequiv = r2->object;
uint64_t r2_l0equiv = r2->start_blkid;
- int64_t cmp = AVL_CMP(r1->eos_marker, r2->eos_marker);
+ int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
if (unlikely(cmp))
return (cmp);
if (r1->object == 0) {
@@ -1414,17 +1414,17 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2)
r2_l0equiv = 0;
}
- cmp = AVL_CMP(r1_objequiv, r2_objequiv);
+ cmp = TREE_CMP(r1_objequiv, r2_objequiv);
if (likely(cmp))
return (cmp);
- cmp = AVL_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
+ cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
if (unlikely(cmp))
return (cmp);
- cmp = AVL_CMP(r2->type == OBJECT, r1->type == OBJECT);
+ cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
if (unlikely(cmp))
return (cmp);
- return (AVL_CMP(r1_l0equiv, r2_l0equiv));
+ return (TREE_CMP(r1_l0equiv, r2_l0equiv));
}
enum q_idx {
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 95132344c..9bca72d53 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -89,11 +89,11 @@ dbuf_compare(const void *x1, const void *x2)
const dmu_buf_impl_t *d1 = x1;
const dmu_buf_impl_t *d2 = x2;
- int cmp = AVL_CMP(d1->db_level, d2->db_level);
+ int cmp = TREE_CMP(d1->db_level, d2->db_level);
if (likely(cmp))
return (cmp);
- cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+ cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
if (likely(cmp))
return (cmp);
@@ -105,7 +105,7 @@ dbuf_compare(const void *x1, const void *x2)
return (1);
}
- return (AVL_PCMP(d1, d2));
+ return (TREE_PCMP(d1, d2));
}
/* ARGSUSED */
@@ -2218,7 +2218,8 @@ done:
{
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
}
range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index 2126f3d9b..42c612abc 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -595,16 +595,16 @@ dsl_bookmark_compare(const void *l, const void *r)
const dsl_bookmark_node_t *ldbn = l;
const dsl_bookmark_node_t *rdbn = r;
- int64_t cmp = AVL_CMP(ldbn->dbn_phys.zbm_creation_txg,
+ int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg,
rdbn->dbn_phys.zbm_creation_txg);
if (likely(cmp))
return (cmp);
- cmp = AVL_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
+ cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
(rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
if (likely(cmp))
return (cmp);
cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name);
- return (AVL_ISIGN(cmp));
+ return (TREE_ISIGN(cmp));
}
/*
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 2e4d18ade..8cb0f90fb 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -118,7 +118,7 @@ dsl_deadlist_compare(const void *arg1, const void *arg2)
const dsl_deadlist_entry_t *dle1 = arg1;
const dsl_deadlist_entry_t *dle2 = arg2;
- return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
+ return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}
static int
@@ -127,7 +127,7 @@ dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
const dsl_deadlist_cache_entry_t *dlce1 = arg1;
const dsl_deadlist_cache_entry_t *dlce2 = arg2;
- return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
+ return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
}
static void
@@ -917,14 +917,14 @@ livelist_compare(const void *larg, const void *rarg)
uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
if (l_dva0_vdev != r_dva0_vdev)
- return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
+ return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
/* if vdevs are equal, sort by offsets. */
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
if (l_dva0_offset == r_dva0_offset)
ASSERT3U(l->blk_birth, ==, r->blk_birth);
- return (AVL_CMP(l_dva0_offset, r_dva0_offset));
+ return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}
struct livelist_iter_arg {
diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c
index cef460f02..cf8a3c9bb 100644
--- a/module/zfs/dsl_deleg.c
+++ b/module/zfs/dsl_deleg.c
@@ -399,7 +399,7 @@ perm_set_compare(const void *arg1, const void *arg2)
val = strcmp(node1->p_setname, node2->p_setname);
- return (AVL_ISIGN(val));
+ return (TREE_ISIGN(val));
}
/*
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index f1621c9c9..3eacb42f1 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -279,7 +279,7 @@ struct dsl_scan_io_queue {
/* trees used for sorting I/Os and extents of I/Os */
range_tree_t *q_exts_by_addr;
- avl_tree_t q_exts_by_size;
+ zfs_btree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
uint64_t q_sio_memused;
@@ -646,7 +646,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
mutex_enter(&vd->vdev_scan_io_queue_lock);
ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
- ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+ ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
+ NULL);
ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
mutex_exit(&vd->vdev_scan_io_queue_lock);
}
@@ -1242,7 +1243,7 @@ dsl_scan_should_clear(dsl_scan_t *scn)
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
/* # extents in exts_by_size = # in exts_by_addr */
- mused += avl_numnodes(&queue->q_exts_by_size) *
+ mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
@@ -2847,7 +2848,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
srch_sio = sio_alloc(1);
srch_sio->sio_nr_dvas = 1;
- SIO_SET_OFFSET(srch_sio, rs->rs_start);
+ SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
/*
* The exact start of the extent might not contain any matching zios,
@@ -2859,10 +2860,12 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL &&
- SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
- ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
- ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
+ while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr) && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
+ queue->q_exts_by_addr));
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
+ queue->q_exts_by_addr));
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
@@ -2880,16 +2883,19 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr)) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
+ SIO_GET_OFFSET(sio), rs_get_end(rs,
+ queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
- range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
- rs->rs_end - rs->rs_start);
+ uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
+ uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
+ range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
return (B_FALSE);
}
}
@@ -2909,6 +2915,7 @@ static range_seg_t *
scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
{
dsl_scan_t *scn = queue->q_scn;
+ range_tree_t *rt = queue->q_exts_by_addr;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
ASSERT(scn->scn_is_sorted);
@@ -2916,9 +2923,16 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
/* handle tunable overrides */
if (scn->scn_checkpointing || scn->scn_clearing) {
if (zfs_scan_issue_strategy == 1) {
- return (range_tree_first(queue->q_exts_by_addr));
+ return (range_tree_first(rt));
} else if (zfs_scan_issue_strategy == 2) {
- return (avl_first(&queue->q_exts_by_size));
+ range_seg_t *size_rs =
+ zfs_btree_first(&queue->q_exts_by_size, NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start,
+ size);
+ ASSERT3P(addr_rs, !=, NULL);
+ return (addr_rs);
}
}
@@ -2932,9 +2946,15 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
* In this case, we instead switch to issuing extents in LBA order.
*/
if (scn->scn_checkpointing) {
- return (range_tree_first(queue->q_exts_by_addr));
+ return (range_tree_first(rt));
} else if (scn->scn_clearing) {
- return (avl_first(&queue->q_exts_by_size));
+ range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
+ NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start, size);
+ ASSERT3P(addr_rs, !=, NULL);
+ return (addr_rs);
} else {
return (NULL);
}
@@ -4038,9 +4058,10 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
static int
ext_size_compare(const void *x, const void *y)
{
- const range_seg_t *rsa = x, *rsb = y;
- uint64_t sa = rsa->rs_end - rsa->rs_start,
- sb = rsb->rs_end - rsb->rs_start;
+ const range_seg_gap_t *rsa = x, *rsb = y;
+
+ uint64_t sa = rsa->rs_end - rsa->rs_start;
+ uint64_t sb = rsb->rs_end - rsb->rs_start;
uint64_t score_a, score_b;
score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
@@ -4069,7 +4090,7 @@ sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
- return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
+ return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@@ -4083,8 +4104,8 @@ scan_io_queue_create(vdev_t *vd)
q->q_vd = vd;
q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
- q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
- &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
+ &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
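
Because q_exts_by_size is now a zfs_btree_t holding its own copies of the segments rather than sharing AVL nodes with q_exts_by_addr, both branches above translate the head of the size-sorted tree back into the address-sorted range tree before returning it. A minimal sketch of that shared pattern, with a hypothetical helper name, could look like this:

static range_seg_t *
scan_best_extent_by_size(dsl_scan_io_queue_t *queue)
{
	range_tree_t *rt = queue->q_exts_by_addr;
	range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, NULL);

	if (size_rs == NULL)
		return (NULL);

	/* Decode with the address tree; both trees share one encoding. */
	uint64_t start = rs_get_start(size_rs, rt);
	uint64_t size = rs_get_end(size_rs, rt) - start;

	/* Re-find the canonical segment in the address-ordered tree. */
	range_seg_t *addr_rs = range_tree_find(rt, start, size);
	ASSERT3P(addr_rs, !=, NULL);
	return (addr_rs);
}
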
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d4dacbc2f..7b8ac91b1 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -36,6 +36,7 @@
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>
+#include <sys/btree.h>
#define WITH_DF_BLOCK_ALLOCATOR
@@ -184,6 +185,13 @@ int metaslab_df_free_pct = 4;
int metaslab_df_max_search = 16 * 1024 * 1024;
/*
+ * Forces the metaslab_block_picker function to search at least this many
+ * segments forward before giving up on finding a segment that the allocation
+ * will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
* If we are not searching forward (due to metaslab_df_max_search,
* metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
* controls what segment is used. If it is set, we will use the largest free
@@ -277,17 +285,32 @@ uint64_t metaslab_trace_max_entries = 5000;
int max_disabled_ms = 3;
/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
* Maximum percentage of memory to use on storing loaded metaslabs. If loading
* a metaslab would take it over this percentage, the oldest selected metaslab
* is automatically unloaded.
*/
-int zfs_metaslab_mem_limit = 25;
+int zfs_metaslab_mem_limit = 75;
/*
- * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
- * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
*/
-unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
static uint64_t metaslab_weight(metaslab_t *, boolean_t);
static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
@@ -299,8 +322,66 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
static unsigned int metaslab_idx_func(multilist_t *, void *);
static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
+
+typedef struct metaslab_stats {
+ kstat_named_t metaslabstat_trace_over_limit;
+ kstat_named_t metaslabstat_df_find_under_floor;
+ kstat_named_t metaslabstat_reload_tree;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+ { "trace_over_limit", KSTAT_DATA_UINT64 },
+ { "df_find_under_floor", KSTAT_DATA_UINT64 },
+ { "reload_tree", KSTAT_DATA_UINT64 },
+};
+
+#define METASLABSTAT_BUMP(stat) \
+ atomic_inc_64(&metaslab_stats.stat.value.ui64);
+
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+ "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (metaslab_ksp != NULL) {
+ metaslab_ksp->ks_data = &metaslab_stats;
+ kstat_install(metaslab_ksp);
+ }
+}
+
+void
+metaslab_stat_fini(void)
+{
+ if (metaslab_ksp != NULL) {
+ kstat_delete(metaslab_ksp);
+ metaslab_ksp = NULL;
+ }
+
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+#else
+
+void
+metaslab_stat_init(void)
+{
+}
+
+void
+metaslab_stat_fini(void)
+{
+}
#endif
/*
@@ -608,13 +689,13 @@ metaslab_compare(const void *x1, const void *x2)
if (sort1 > sort2)
return (1);
- int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
+ int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
if (likely(cmp))
return (cmp);
- IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+ IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
- return (AVL_CMP(m1->ms_start, m2->ms_start));
+ return (TREE_CMP(m1->ms_start, m2->ms_start));
}
/*
@@ -711,17 +792,17 @@ metaslab_sort_by_flushed(const void *va, const void *vb)
const metaslab_t *a = va;
const metaslab_t *b = vb;
- int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+ int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
if (likely(cmp))
return (cmp);
uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
- cmp = AVL_CMP(a_vdev_id, b_vdev_id);
+ cmp = TREE_CMP(a_vdev_id, b_vdev_id);
if (cmp)
return (cmp);
- return (AVL_CMP(a->ms_id, b->ms_id));
+ return (TREE_CMP(a->ms_id, b->ms_id));
}
metaslab_group_t *
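
The AVL_CMP to TREE_CMP rename above lets one three-way helper serve both AVL and B-tree comparators. Assuming TREE_CMP keeps the usual ((a > b) - (a < b)) form, a generic two-key comparator in this style is sketched below; the node type and field names are placeholders, not part of the patch:

typedef struct example_node {
	uint64_t en_primary;
	uint64_t en_secondary;
} example_node_t;

static int
example_node_compare(const void *x1, const void *x2)
{
	const example_node_t *a = x1;
	const example_node_t *b = x2;

	/* TREE_CMP() is assumed to return -1, 0, or 1. */
	int cmp = TREE_CMP(a->en_primary, b->en_primary);
	if (likely(cmp))
		return (cmp);

	/* Break ties on a second key so the ordering is total. */
	return (TREE_CMP(a->en_secondary, b->en_secondary));
}
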
@@ -1212,25 +1293,170 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
*/
/*
- * Comparison function for the private size-ordered tree. Tree is sorted
- * by size, larger sizes at the end of the tree.
+ * Comparison function for the private size-ordered tree using 32-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * Comparison function for the private size-ordered tree using 64-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
static int
-metaslab_rangesize_compare(const void *x1, const void *x2)
+metaslab_rangesize64_compare(const void *x1, const void *x2)
{
- const range_seg_t *r1 = x1;
- const range_seg_t *r2 = x2;
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
uint64_t rs_size1 = r1->rs_end - r1->rs_start;
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
- int cmp = AVL_CMP(rs_size1, rs_size2);
+ int cmp = TREE_CMP(rs_size1, rs_size2);
if (likely(cmp))
return (cmp);
- return (AVL_CMP(r1->rs_start, r2->rs_start));
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+typedef struct metaslab_rt_arg {
+ zfs_btree_t *mra_bt;
+ uint32_t mra_floor_shift;
+} metaslab_rt_arg_t;
+
+struct mssa_arg {
+ range_tree_t *rt;
+ metaslab_rt_arg_t *mra;
+};
+
+static void
+metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
+{
+ struct mssa_arg *mssap = arg;
+ range_tree_t *rt = mssap->rt;
+ metaslab_rt_arg_t *mrap = mssap->mra;
+ range_seg_max_t seg = {0};
+ rs_set_start(&seg, rt, start);
+ rs_set_end(&seg, rt, start + size);
+ metaslab_rt_add(rt, &seg, mrap);
+}
+
+static void
+metaslab_size_tree_full_load(range_tree_t *rt)
+{
+ metaslab_rt_arg_t *mrap = rt->rt_arg;
+#ifdef _METASLAB_TRACING
+ METASLABSTAT_BUMP(metaslabstat_reload_tree);
+#endif
+ ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
+ mrap->mra_floor_shift = 0;
+ struct mssa_arg arg = {0};
+ arg.rt = rt;
+ arg.mra = mrap;
+ range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
/*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
+ */
+/* ARGSUSED */
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = metaslab_rangesize32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = metaslab_rangesize64_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, compare, size);
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ zfs_btree_destroy(size_tree);
+ kmem_free(mrap, sizeof (*mrap));
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
+ (1 << mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+ mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ metaslab_rt_create(rt, arg);
+}
+
+static range_tree_ops_t metaslab_rt_ops = {
+ .rtop_create = metaslab_rt_create,
+ .rtop_destroy = metaslab_rt_destroy,
+ .rtop_add = metaslab_rt_add,
+ .rtop_remove = metaslab_rt_remove,
+ .rtop_vacate = metaslab_rt_vacate
+};
+
+/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
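
The callbacks above mirror a segment into the size-sorted B-tree only if it is at least 1 << mra_floor_shift bytes long; smaller frees stay solely in the address-ordered tree until metaslab_size_tree_full_load() rebuilds the size tree with the floor dropped to zero. A sketch of the filtering predicate (the helper name is hypothetical, the 16 KiB default comes from metaslab_by_size_min_shift above):

static boolean_t
metaslab_seg_tracked_by_size(range_tree_t *rt, range_seg_t *rs,
    uint32_t floor_shift)
{
	uint64_t len = rs_get_end(rs, rt) - rs_get_start(rs, rt);

	/* With floor_shift = 14: a 4 KiB free is skipped, 1 MiB is added. */
	return (len >= (1ULL << floor_shift));
}
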
@@ -1242,16 +1468,20 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
uint64_t
metaslab_largest_allocatable(metaslab_t *msp)
{
- avl_tree_t *t = &msp->ms_allocatable_by_size;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL)
return (0);
- rs = avl_last(t);
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ rs = zfs_btree_last(t, NULL);
if (rs == NULL)
return (0);
- return (rs->rs_end - rs->rs_start);
+ return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
+ msp->ms_allocatable));
}
/*
@@ -1266,7 +1496,10 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
if (msp->ms_unflushed_frees == NULL)
return (0);
- range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
+ if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_unflushed_frees);
+ range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
+ NULL);
if (rs == NULL)
return (0);
@@ -1293,8 +1526,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
* the largest segment; there may be other usable chunks in the
* largest segment, but we ignore them.
*/
- uint64_t rstart = rs->rs_start;
- uint64_t rsize = rs->rs_end - rstart;
+ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
+ uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
uint64_t start = 0;
uint64_t size = 0;
@@ -1318,17 +1551,18 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
}
static range_seg_t *
-metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
+ uint64_t size, zfs_btree_index_t *where)
{
- range_seg_t *rs, rsearch;
- avl_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
- rsearch.rs_start = start;
- rsearch.rs_end = start + size;
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, start + size);
- rs = avl_find(t, &rsearch, &where);
+ rs = zfs_btree_find(t, &rsearch, where);
if (rs == NULL) {
- rs = avl_nearest(t, where, AVL_AFTER);
+ rs = zfs_btree_next(t, where, where);
}
return (rs);
@@ -1337,27 +1571,34 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
/*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * This is a helper function that can be used by the allocator to find a
+ * suitable block to allocate. This will search the specified B-tree looking
+ * for a block that matches the specified criteria.
*/
static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
uint64_t max_search)
{
- range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+ if (*cursor == 0)
+ *cursor = rt->rt_start;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+ range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
uint64_t first_found;
+ int count_searched = 0;
if (rs != NULL)
- first_found = rs->rs_start;
+ first_found = rs_get_start(rs, rt);
- while (rs != NULL && rs->rs_start - first_found <= max_search) {
- uint64_t offset = rs->rs_start;
- if (offset + size <= rs->rs_end) {
+ while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
+ max_search || count_searched < metaslab_min_search_count)) {
+ uint64_t offset = rs_get_start(rs, rt);
+ if (offset + size <= rs_get_end(rs, rt)) {
*cursor = offset + size;
return (offset);
}
- rs = AVL_NEXT(t, rs);
+ rs = zfs_btree_next(bt, &where, &where);
+ count_searched++;
}
*cursor = 0;
@@ -1403,8 +1644,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
uint64_t offset;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(&rt->rt_root), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
/*
* If we're running low on space, find a segment based on size,
@@ -1414,22 +1653,33 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
free_pct < metaslab_df_free_pct) {
offset = -1;
} else {
- offset = metaslab_block_picker(&rt->rt_root,
+ offset = metaslab_block_picker(rt,
cursor, size, metaslab_df_max_search);
}
if (offset == -1) {
range_seg_t *rs;
+ if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
if (metaslab_df_use_largest_segment) {
/* use largest free segment */
- rs = avl_last(&msp->ms_allocatable_by_size);
+ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
} else {
+ zfs_btree_index_t where;
/* use segment of this size, or next largest */
+#ifdef _METASLAB_TRACING
+ metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
+ if (size < (1 << mrap->mra_floor_shift)) {
+ METASLABSTAT_BUMP(
+ metaslabstat_df_find_under_floor);
+ }
+#endif
rs = metaslab_block_find(&msp->ms_allocatable_by_size,
- 0, size);
+ rt, msp->ms_start, size, &where);
}
- if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
- offset = rs->rs_start;
+ if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+ rt)) {
+ offset = rs_get_start(rs, rt);
*cursor = offset + size;
}
}
@@ -1458,25 +1708,27 @@ static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &msp->ms_allocatable_by_size;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
ASSERT3U(*cursor_end, >=, *cursor);
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
- rs = avl_last(&msp->ms_allocatable_by_size);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
+ size)
return (-1ULL);
- *cursor = rs->rs_start;
- *cursor_end = rs->rs_end;
+ *cursor = rs_get_start(rs, rt);
+ *cursor_end = rs_get_end(rs, rt);
}
offset = *cursor;
@@ -1511,39 +1763,40 @@ uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &msp->ms_allocatable->rt_root;
- avl_index_t where;
- range_seg_t *rs, rsearch;
+ zfs_btree_t *t = &msp->ms_allocatable->rt_root;
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
uint64_t max_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
- rsearch.rs_start = *cursor;
- rsearch.rs_end = *cursor + size;
+ rs_set_start(&rsearch, rt, *cursor);
+ rs_set_end(&rsearch, rt, *cursor + size);
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
t = &msp->ms_allocatable_by_size;
- rsearch.rs_start = 0;
- rsearch.rs_end = MIN(max_size,
- 1ULL << (hbit + metaslab_ndf_clump_shift));
- rs = avl_find(t, &rsearch, &where);
+ rs_set_start(&rsearch, rt, 0);
+ rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
+ metaslab_ndf_clump_shift)));
+
+ rs = zfs_btree_find(t, &rsearch, &where);
if (rs == NULL)
- rs = avl_nearest(t, where, AVL_AFTER);
+ rs = zfs_btree_next(t, &where, &where);
ASSERT(rs != NULL);
}
- if ((rs->rs_end - rs->rs_start) >= size) {
- *cursor = rs->rs_start + size;
- return (rs->rs_start);
+ if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
+ *cursor = rs_get_start(rs, rt) + size;
+ return (rs_get_start(rs, rt));
}
return (-1ULL);
}
@@ -1889,9 +2142,8 @@ metaslab_potentially_evict(metaslab_class_t *mc)
{
#ifdef _KERNEL
uint64_t allmem = arc_all_memory();
- extern kmem_cache_t *range_seg_cache;
- uint64_t inuse = range_seg_cache->skc_obj_total;
- uint64_t size = range_seg_cache->skc_obj_size;
+ uint64_t inuse = zfs_btree_leaf_cache->skc_obj_total;
+ uint64_t size = zfs_btree_leaf_cache->skc_obj_size;
int tries = 0;
for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
@@ -1928,7 +2180,7 @@ metaslab_potentially_evict(metaslab_class_t *mc)
*/
if (msp->ms_loading) {
msp = next_msp;
- inuse = range_seg_cache->skc_obj_total;
+ inuse = zfs_btree_leaf_cache->skc_obj_total;
continue;
}
/*
@@ -1950,7 +2202,7 @@ metaslab_potentially_evict(metaslab_class_t *mc)
}
mutex_exit(&msp->ms_lock);
msp = next_msp;
- inuse = range_seg_cache->skc_obj_total;
+ inuse = zfs_btree_leaf_cache->skc_obj_total;
}
}
#endif
@@ -1993,11 +2245,40 @@ metaslab_load_impl(metaslab_t *msp)
mutex_exit(&msp->ms_lock);
hrtime_t load_start = gethrtime();
+ metaslab_rt_arg_t *mrap;
+ if (msp->ms_allocatable->rt_arg == NULL) {
+ mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ } else {
+ mrap = msp->ms_allocatable->rt_arg;
+ msp->ms_allocatable->rt_ops = NULL;
+ msp->ms_allocatable->rt_arg = NULL;
+ }
+ mrap->mra_bt = &msp->ms_allocatable_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
if (msp->ms_sm != NULL) {
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
SM_FREE, length);
+
+ /* Now, populate the size-sorted tree. */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+
+ struct mssa_arg arg = {0};
+ arg.rt = msp->ms_allocatable;
+ arg.mra = mrap;
+ range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+ &arg);
} else {
/*
+ * Add the size-sorted tree first, since we don't need to load
+ * the metaslab from the spacemap.
+ */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+ /*
* The space map has not been allocated yet, so treat
* all the space in the metaslab as free and add it to the
* ms_allocatable tree.
@@ -2252,6 +2533,29 @@ metaslab_unload(metaslab_t *msp)
metaslab_recalculate_weight_and_sort(msp);
}
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexing from the start of the metaslab. If
+ * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
+ */
+static range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+ uint64_t *start, uint64_t *shift)
+{
+ if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+ !zfs_metaslab_force_large_segs) {
+ *shift = vdev->vdev_ashift;
+ *start = msp->ms_start;
+ return (RANGE_SEG32);
+ } else {
+ *shift = 0;
+ *start = 0;
+ return (RANGE_SEG64);
+ }
+}
+
void
metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
{
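
Concretely, with rt_start = ms_start and rt_shift = vdev_ashift, a stored 32-bit value is just the sector offset within the metaslab. The real accessors live in include/sys/range_tree.h; the pair below is only an illustration of the arithmetic the comment describes:

static inline uint32_t
seg32_encode(uint64_t addr, uint64_t rt_start, uint64_t rt_shift)
{
	/* Byte address to sector offset relative to the metaslab start. */
	return ((uint32_t)((addr - rt_start) >> rt_shift));
}

static inline uint64_t
seg32_decode(uint32_t raw, uint64_t rt_start, uint64_t rt_shift)
{
	/* Sector offset back to an absolute byte address. */
	return (((uint64_t)raw << rt_shift) + rt_start);
}
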
@@ -2327,6 +2631,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
/*
* We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
@@ -2335,10 +2643,9 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
* we'd data fault on any attempt to use this metaslab before
* it's ready.
*/
- ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
- &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
+ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_trim = range_tree_create(NULL, NULL);
+ ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms, B_FALSE);
@@ -2393,7 +2700,7 @@ metaslab_unflushed_changes_memused(metaslab_t *ms)
{
return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
range_tree_numsegs(ms->ms_unflushed_frees)) *
- sizeof (range_seg_t));
+ ms->ms_unflushed_allocs->rt_root.bt_elem_size);
}
void
@@ -3187,7 +3494,7 @@ metaslab_should_condense(metaslab_t *msp)
* We always condense metaslabs that are empty and metaslabs for
* which a condense request has been made.
*/
- if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+ if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
msp->ms_condense_wanted)
return (B_TRUE);
@@ -3233,28 +3540,29 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* So to truncate the space map to represent all the entries of
* previous TXGs we do the following:
*
- * 1] We create a range tree (condense tree) that is 100% allocated.
- * 2] We remove from it all segments found in the ms_defer trees
+ * 1] We create a range tree (condense tree) that is 100% empty.
+ * 2] We add to it all segments found in the ms_defer trees
* as those segments are marked as free in the original space
* map. We do the same with the ms_allocating trees for the same
- * reason. Removing these segments should be a relatively
+ * reason. Adding these segments should be a relatively
* inexpensive operation since we expect these trees to have a
* small number of nodes.
- * 3] We vacate any unflushed allocs as they should already exist
- * in the condense tree. Then we vacate any unflushed frees as
- * they should already be part of ms_allocatable.
- * 4] At this point, we would ideally like to remove all segments
+ * 3] We vacate any unflushed allocs, since they are not frees we
+ * need to add to the condense tree. Then we vacate any
+ * unflushed frees as they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to add all segments
* in the ms_allocatable tree from the condense tree. This way
* we would write all the entries of the condense tree as the
- * condensed space map, which would only contain allocated
- * segments with everything else assumed to be freed.
+ * condensed space map, which would only contain freed
+ * segments with everything else assumed to be allocated.
*
* Doing so can be prohibitively expensive as ms_allocatable can
- * be large, and therefore computationally expensive to subtract
- * from the condense_tree. Instead we first sync out the
- * condense_tree and then the ms_allocatable, in the condensed
- * space map. While this is not optimal, it is typically close to
- * optimal and more importantly much cheaper to compute.
+ * be large, and therefore computationally expensive to add to
+ * the condense_tree. Instead we first sync out an entry marking
+ * everything as allocated, then the condense_tree and then the
+ * ms_allocatable, in the condensed space map. While this is not
+ * optimal, it is typically close to optimal and more importantly
+ * much cheaper to compute.
*
* 5] Finally, as both of the unflushed trees were written to our
* new and condensed metaslab space map, we basically flushed
@@ -3268,22 +3576,26 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
spa->spa_name, space_map_length(msp->ms_sm),
- avl_numnodes(&msp->ms_allocatable->rt_root),
+ range_tree_numsegs(msp->ms_allocatable),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
- condense_tree = range_tree_create(NULL, NULL);
- range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+ &start, &shift);
+
+ condense_tree = range_tree_create(NULL, type, NULL, start, shift);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
- range_tree_remove, condense_tree);
+ range_tree_add, condense_tree);
}
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
- range_tree_remove, condense_tree);
+ range_tree_add, condense_tree);
}
ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
@@ -3331,11 +3643,17 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* followed by FREES (due to space_map_write() in metaslab_sync()) for
* sync pass 1.
*/
- space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+ shift);
+ range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+ space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
+ range_tree_vacate(tmp_tree, NULL, NULL);
+ range_tree_destroy(tmp_tree);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
@@ -3578,7 +3896,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
return;
- VERIFY(txg <= spa_final_dirty_txg(spa));
+ VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
/*
* The only state that can actually be changing concurrently
@@ -3867,32 +4185,46 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* range trees and add its capacity to the vdev.
*/
if (msp->ms_freed == NULL) {
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, msp, &start,
+ &shift);
+
for (int t = 0; t < TXG_SIZE; t++) {
ASSERT(msp->ms_allocating[t] == NULL);
- msp->ms_allocating[t] = range_tree_create(NULL, NULL);
+ msp->ms_allocating[t] = range_tree_create(NULL, type,
+ NULL, start, shift);
}
ASSERT3P(msp->ms_freeing, ==, NULL);
- msp->ms_freeing = range_tree_create(NULL, NULL);
+ msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
+ shift);
ASSERT3P(msp->ms_freed, ==, NULL);
- msp->ms_freed = range_tree_create(NULL, NULL);
+ msp->ms_freed = range_tree_create(NULL, type, NULL, start,
+ shift);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
ASSERT3P(msp->ms_defer[t], ==, NULL);
- msp->ms_defer[t] = range_tree_create(NULL, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
+ start, shift);
}
ASSERT3P(msp->ms_checkpointing, ==, NULL);
- msp->ms_checkpointing = range_tree_create(NULL, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
+ start, shift);
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
- msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
+ msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
- msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
- &msp->ms_unflushed_frees_by_size,
- metaslab_rangesize_compare, 0);
+ msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+ type, mrap, start, shift);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
@@ -4052,36 +4384,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva)
* ==========================================================================
*/
#ifdef _METASLAB_TRACING
-kstat_t *metaslab_trace_ksp;
-kstat_named_t metaslab_trace_over_limit;
-
-void
-metaslab_alloc_trace_init(void)
-{
- ASSERT(metaslab_alloc_trace_cache == NULL);
- metaslab_alloc_trace_cache = kmem_cache_create(
- "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
- metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
- "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
- if (metaslab_trace_ksp != NULL) {
- metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
- kstat_named_init(&metaslab_trace_over_limit,
- "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
- kstat_install(metaslab_trace_ksp);
- }
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
- if (metaslab_trace_ksp != NULL) {
- kstat_delete(metaslab_trace_ksp);
- metaslab_trace_ksp = NULL;
- }
- kmem_cache_destroy(metaslab_alloc_trace_cache);
- metaslab_alloc_trace_cache = NULL;
-}
/*
* Add an allocation trace element to the allocation tracing list.
@@ -4108,7 +4410,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
#ifdef DEBUG
panic("too many entries in allocation list");
#endif
- atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
zal->zal_size--;
mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
list_remove(&zal->zal_list, mat_next);
@@ -4161,16 +4463,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
void
-metaslab_alloc_trace_init(void)
-{
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
-}
-
-void
metaslab_trace_init(zio_alloc_list_t *zal)
{
}
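
All of the metaslab.c call sites above funnel into the new five-argument range_tree_create(). For reference, the two call shapes used throughout the hunks reduce to the following sketch; the wrapper function itself is made up for illustration:

static void
range_tree_create_examples(vdev_t *vd, metaslab_t *ms)
{
	range_seg_type_t type;
	uint64_t start, shift;

	/* Per-metaslab trees: 32-bit sector-based segments when they fit. */
	type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
	range_tree_t *per_ms = range_tree_create(NULL, type, NULL, start,
	    shift);

	/* Callers without a natural base and shift keep 64-bit offsets. */
	range_tree_t *plain = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);

	range_tree_destroy(per_ms);
	range_tree_destroy(plain);
}
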
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 0e1297214..0b369a438 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -74,42 +74,38 @@
* support removing complete segments.
*/
-kmem_cache_t *range_seg_cache;
-
-/* Generic ops for managing an AVL tree alongside a range tree */
-struct range_tree_ops rt_avl_ops = {
- .rtop_create = rt_avl_create,
- .rtop_destroy = rt_avl_destroy,
- .rtop_add = rt_avl_add,
- .rtop_remove = rt_avl_remove,
- .rtop_vacate = rt_avl_vacate,
-};
-
-void
-range_tree_init(void)
-{
- ASSERT(range_seg_cache == NULL);
- range_seg_cache = kmem_cache_create("range_seg_cache",
- sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-range_tree_fini(void)
-{
- kmem_cache_destroy(range_seg_cache);
- range_seg_cache = NULL;
+static inline void
+rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size = 0;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ VERIFY(0);
+ }
+ bcopy(src, dest, size);
}
void
range_tree_stat_verify(range_tree_t *rt)
{
range_seg_t *rs;
+ zfs_btree_index_t where;
uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
int i;
- for (rs = avl_first(&rt->rt_root); rs != NULL;
- rs = AVL_NEXT(&rt->rt_root, rs)) {
- uint64_t size = rs->rs_end - rs->rs_start;
+ for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
int idx = highbit64(size) - 1;
hist[idx]++;
@@ -128,7 +124,7 @@ range_tree_stat_verify(range_tree_t *rt)
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
- uint64_t size = rs->rs_end - rs->rs_start;
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
int idx = highbit64(size) - 1;
ASSERT(size != 0);
@@ -142,7 +138,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
- uint64_t size = rs->rs_end - rs->rs_start;
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
int idx = highbit64(size) - 1;
ASSERT(size != 0);
@@ -153,14 +149,35 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
rt->rt_histogram[idx]--;
}
-/*
- * NOTE: caller is responsible for all locking.
- */
static int
-range_tree_seg_compare(const void *x1, const void *x2)
+range_tree_seg32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg64_compare(const void *x1, const void *x2)
{
- const range_seg_t *r1 = (const range_seg_t *)x1;
- const range_seg_t *r2 = (const range_seg_t *)x2;
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg_gap_compare(const void *x1, const void *x2)
+{
+ const range_seg_gap_t *r1 = x1;
+ const range_seg_gap_t *r2 = x2;
ASSERT3U(r1->rs_start, <=, r1->rs_end);
ASSERT3U(r2->rs_start, <=, r2->rs_end);
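
Each comparator above uses the ((start1 >= end2) - (end1 <= start2)) expression, which returns -1 when the first segment lies entirely before the second, 1 when it lies entirely after, and 0 whenever the two overlap; treating overlap as equality is what lets zfs_btree_find() locate the in-tree segment covering a search range. A worked example with arbitrary values (the test function is hypothetical):

static void
seg_compare_examples(void)
{
	range_seg64_t a = { .rs_start = 10, .rs_end = 20 };
	range_seg64_t b = { .rs_start = 30, .rs_end = 40 };
	range_seg64_t c = { .rs_start = 15, .rs_end = 35 };

	ASSERT3S(range_tree_seg64_compare(&a, &b), ==, -1);	/* a before b */
	ASSERT3S(range_tree_seg64_compare(&b, &a), ==, 1);	/* b after a */
	ASSERT0(range_tree_seg64_compare(&c, &a));		/* overlap */
}
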
@@ -169,18 +186,42 @@ range_tree_seg_compare(const void *x1, const void *x2)
}
range_tree_t *
-range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare) (const void *, const void *), uint64_t gap)
+range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
+ uint64_t start, uint64_t shift,
+ int (*zfs_btree_compare) (const void *, const void *),
+ uint64_t gap)
{
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
- avl_create(&rt->rt_root, range_tree_seg_compare,
- sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+ ASSERT3U(shift, <, 64);
+ ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = range_tree_seg32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = range_tree_seg64_compare;
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ compare = range_tree_seg_gap_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", type);
+ }
+ zfs_btree_create(&rt->rt_root, compare, size);
rt->rt_ops = ops;
rt->rt_gap = gap;
rt->rt_arg = arg;
- rt->rt_avl_compare = avl_compare;
+ rt->rt_type = type;
+ rt->rt_start = start;
+ rt->rt_shift = shift;
+ rt->rt_btree_compare = zfs_btree_compare;
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
@@ -189,9 +230,10 @@ range_tree_create_impl(range_tree_ops_t *ops, void *arg,
}
range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
+range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift)
{
- return (range_tree_create_impl(ops, arg, NULL, 0));
+ return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
}
void
@@ -202,19 +244,20 @@ range_tree_destroy(range_tree_t *rt)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
- avl_destroy(&rt->rt_root);
+ zfs_btree_destroy(&rt->rt_root);
kmem_free(rt, sizeof (*rt));
}
void
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
- ASSERT3U(rs->rs_fill + delta, !=, 0);
- ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+ ASSERT3U(rs_get_fill(rs, rt) + delta, !=, 0);
+ ASSERT3U(rs_get_fill(rs, rt) + delta, <=, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
- rs->rs_fill += delta;
+ rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
@@ -223,28 +266,20 @@ static void
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
range_tree_t *rt = arg;
- avl_index_t where;
- range_seg_t rsearch, *rs_before, *rs_after, *rs;
+ zfs_btree_index_t where;
+ range_seg_t *rs_before, *rs_after, *rs;
+ range_seg_max_t tmp, rsearch;
uint64_t end = start + size, gap = rt->rt_gap;
uint64_t bridge_size = 0;
boolean_t merge_before, merge_after;
ASSERT3U(size, !=, 0);
ASSERT3U(fill, <=, size);
+ ASSERT3U(start + size, >, start);
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- rs = avl_find(&rt->rt_root, &rsearch, &where);
-
- if (gap == 0 && rs != NULL &&
- rs->rs_start <= start && rs->rs_end >= end) {
- zfs_panic_recover("zfs: allocating allocated segment"
- "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
- (longlong_t)start, (longlong_t)size,
- (longlong_t)rs->rs_start,
- (longlong_t)rs->rs_end - rs->rs_start);
- return;
- }
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
/*
* If this is a gap-supporting range tree, it is possible that we
@@ -255,27 +290,28 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
* the normal code paths.
*/
if (rs != NULL) {
+ ASSERT3U(rt->rt_gap, !=, 0);
+ uint64_t rstart = rs_get_start(rs, rt);
+ uint64_t rend = rs_get_end(rs, rt);
ASSERT3U(gap, !=, 0);
- if (rs->rs_start <= start && rs->rs_end >= end) {
+ if (rstart <= start && rend >= end) {
range_tree_adjust_fill(rt, rs, fill);
return;
}
- avl_remove(&rt->rt_root, rs);
+ zfs_btree_remove(&rt->rt_root, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
range_tree_stat_decr(rt, rs);
- rt->rt_space -= rs->rs_end - rs->rs_start;
+ rt->rt_space -= rend - rstart;
- fill += rs->rs_fill;
- start = MIN(start, rs->rs_start);
- end = MAX(end, rs->rs_end);
+ fill += rs_get_fill(rs, rt);
+ start = MIN(start, rstart);
+ end = MAX(end, rend);
size = end - start;
range_tree_add_impl(rt, start, size, fill);
-
- kmem_cache_free(range_seg_cache, rs);
return;
}
@@ -286,19 +322,21 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
* If gap != 0, we might need to merge with our neighbors even if we
* aren't directly touching.
*/
- rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
- rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+ zfs_btree_index_t where_before, where_after;
+ rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
+ rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
- merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
- merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+ merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >=
+ start - gap);
+ merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end +
+ gap);
if (merge_before && gap != 0)
- bridge_size += start - rs_before->rs_end;
+ bridge_size += start - rs_get_end(rs_before, rt);
if (merge_after && gap != 0)
- bridge_size += rs_after->rs_start - end;
+ bridge_size += rs_get_start(rs_after, rt) - end;
if (merge_before && merge_after) {
- avl_remove(&rt->rt_root, rs_before);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
@@ -307,9 +345,19 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
- rs_after->rs_fill += rs_before->rs_fill + fill;
- rs_after->rs_start = rs_before->rs_start;
- kmem_cache_free(range_seg_cache, rs_before);
+ rs_copy(rs_after, &tmp, rt);
+ uint64_t before_start = rs_get_start_raw(rs_before, rt);
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ zfs_btree_remove_from(&rt->rt_root, &where_before);
+
+ /*
+ * We have to re-find the node because our old reference is
+ * invalid as soon as we do any mutating btree operations.
+ */
+ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
+ rs_set_start_raw(rs_after, rt, before_start);
+ rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
rs = rs_after;
} else if (merge_before) {
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
@@ -317,8 +365,9 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
range_tree_stat_decr(rt, rs_before);
- rs_before->rs_fill += fill;
- rs_before->rs_end = end;
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ rs_set_end(rs_before, rt, end);
+ rs_set_fill(rs_before, rt, before_fill + fill);
rs = rs_before;
} else if (merge_after) {
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
@@ -326,22 +375,26 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
range_tree_stat_decr(rt, rs_after);
- rs_after->rs_fill += fill;
- rs_after->rs_start = start;
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ rs_set_start(rs_after, rt, start);
+ rs_set_fill(rs_after, rt, after_fill + fill);
rs = rs_after;
} else {
- rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+ rs = &tmp;
- rs->rs_fill = fill;
- rs->rs_start = start;
- rs->rs_end = end;
- avl_insert(&rt->rt_root, rs, where);
+ rs_set_start(rs, rt, start);
+ rs_set_end(rs, rt, end);
+ rs_set_fill(rs, rt, fill);
+ zfs_btree_insert(&rt->rt_root, rs, &where);
}
- if (gap != 0)
- ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
- else
- ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+ if (gap != 0) {
+ ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ } else {
+ ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
@@ -360,22 +413,25 @@ static void
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
boolean_t do_fill)
{
- avl_index_t where;
- range_seg_t rsearch, *rs, *newseg;
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch, rs_tmp;
uint64_t end = start + size;
boolean_t left_over, right_over;
VERIFY3U(size, !=, 0);
VERIFY3U(size, <=, rt->rt_space);
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- rs = avl_find(&rt->rt_root, &rsearch, &where);
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
/* Make sure we completely overlap with someone */
if (rs == NULL) {
- zfs_panic_recover("zfs: freeing free segment "
- "(offset=%llu size=%llu)",
+ zfs_panic_recover("zfs: removing nonexistent segment from "
+ "range tree (offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size);
return;
}
@@ -388,30 +444,32 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
*/
if (rt->rt_gap != 0) {
if (do_fill) {
- if (rs->rs_fill == size) {
- start = rs->rs_start;
- end = rs->rs_end;
+ if (rs_get_fill(rs, rt) == size) {
+ start = rs_get_start(rs, rt);
+ end = rs_get_end(rs, rt);
size = end - start;
} else {
range_tree_adjust_fill(rt, rs, -size);
return;
}
- } else if (rs->rs_start != start || rs->rs_end != end) {
+ } else if (rs_get_start(rs, rt) != start ||
+ rs_get_end(rs, rt) != end) {
zfs_panic_recover("zfs: freeing partial segment of "
"gap tree (offset=%llu size=%llu) of "
"(offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size,
- (longlong_t)rs->rs_start,
- (longlong_t)rs->rs_end - rs->rs_start);
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs,
+ rt));
return;
}
}
- VERIFY3U(rs->rs_start, <=, start);
- VERIFY3U(rs->rs_end, >=, end);
+ VERIFY3U(rs_get_start(rs, rt), <=, start);
+ VERIFY3U(rs_get_end(rs, rt), >=, end);
- left_over = (rs->rs_start != start);
- right_over = (rs->rs_end != end);
+ left_over = (rs_get_start(rs, rt) != start);
+ right_over = (rs_get_end(rs, rt) != end);
range_tree_stat_decr(rt, rs);
@@ -419,24 +477,33 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
- newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
- newseg->rs_start = end;
- newseg->rs_end = rs->rs_end;
- newseg->rs_fill = newseg->rs_end - newseg->rs_start;
- range_tree_stat_incr(rt, newseg);
+ range_seg_max_t newseg;
+ rs_set_start(&newseg, rt, end);
+ rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt));
+ rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end);
+ range_tree_stat_incr(rt, &newseg);
- rs->rs_end = start;
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+
+ rs_copy(rs, &rs_tmp, rt);
+ if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
+ zfs_btree_insert(&rt->rt_root, &newseg, &where);
+ else
+ zfs_btree_add(&rt->rt_root, &newseg);
- avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
+ rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
} else if (left_over) {
- rs->rs_end = start;
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+ rs_copy(rs, &rs_tmp, rt);
} else if (right_over) {
- rs->rs_start = end;
+ /* This modifies the buffer already inside the range tree */
+ rs_set_start(rs, rt, end);
+ rs_copy(rs, &rs_tmp, rt);
} else {
- avl_remove(&rt->rt_root, rs);
- kmem_cache_free(range_seg_cache, rs);
+ zfs_btree_remove_from(&rt->rt_root, &where);
rs = NULL;
}
@@ -446,11 +513,12 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
* the size, since we do not support removing partial segments
* of range trees with gaps.
*/
- rs->rs_fill = rs->rs_end - rs->rs_start;
- range_tree_stat_incr(rt, rs);
+ rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) -
+ rs_get_start_raw(rs, rt));
+ range_tree_stat_incr(rt, &rs_tmp);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+ rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
}
rt->rt_space -= size;
@@ -472,14 +540,14 @@ void
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize)
{
- int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+ int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt));
range_tree_stat_decr(rt, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
- rs->rs_start = newstart;
- rs->rs_end = newstart + newsize;
+ rs_set_start(rs, rt, newstart);
+ rs_set_end(rs, rt, newstart + newsize);
range_tree_stat_incr(rt, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
@@ -491,22 +559,27 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
- range_seg_t rsearch;
+ range_seg_max_t rsearch;
uint64_t end = start + size;
VERIFY(size != 0);
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- return (avl_find(&rt->rt_root, &rsearch, NULL));
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}
range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
range_seg_t *rs = range_tree_find_impl(rt, start, size);
- if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
+ if (rs != NULL && rs_get_start(rs, rt) <= start &&
+ rs_get_end(rs, rt) >= start + size) {
return (rs);
+ }
return (NULL);
}
@@ -533,24 +606,28 @@ boolean_t
range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
uint64_t *ostart, uint64_t *osize)
{
- range_seg_t rsearch;
- rsearch.rs_start = start;
- rsearch.rs_end = start + 1;
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
- avl_index_t where;
- range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where);
+ range_seg_max_t rsearch;
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
if (rs != NULL) {
*ostart = start;
- *osize = MIN(size, rs->rs_end - start);
+ *osize = MIN(size, rs_get_end(rs, rt) - start);
return (B_TRUE);
}
- rs = avl_nearest(&rt->rt_root, where, AVL_AFTER);
- if (rs == NULL || rs->rs_start > start + size)
+ rs = zfs_btree_next(&rt->rt_root, &where, &where);
+ if (rs == NULL || rs_get_start(rs, rt) > start + size)
return (B_FALSE);
- *ostart = rs->rs_start;
- *osize = MIN(start + size, rs->rs_end) - rs->rs_start;
+ *ostart = rs_get_start(rs, rt);
+ *osize = MIN(start + size, rs_get_end(rs, rt)) -
+ rs_get_start(rs, rt);
return (B_TRUE);
}
@@ -566,9 +643,12 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
if (size == 0)
return;
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
- uint64_t free_start = MAX(rs->rs_start, start);
- uint64_t free_end = MIN(rs->rs_end, start + size);
+ uint64_t free_start = MAX(rs_get_start(rs, rt), start);
+ uint64_t free_end = MIN(rs_get_end(rs, rt), start + size);
range_tree_remove(rt, free_start, free_end - free_start);
}
}
@@ -579,7 +659,7 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
range_tree_t *rt;
ASSERT0(range_tree_space(*rtdst));
- ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
+ ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
rt = *rtsrc;
*rtsrc = *rtdst;
@@ -589,16 +669,20 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
- range_seg_t *rs;
- void *cookie = NULL;
-
if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
- while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
- if (func != NULL)
- func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
- kmem_cache_free(range_seg_cache, rs);
+ if (func != NULL) {
+ range_seg_t *rs;
+ zfs_btree_index_t *cookie = NULL;
+
+ while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
+ NULL) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+ } else {
+ zfs_btree_clear(&rt->rt_root);
}
bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
@@ -608,16 +692,18 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
- for (range_seg_t *rs = avl_first(&rt->rt_root); rs;
- rs = AVL_NEXT(&rt->rt_root, rs)) {
- func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
+ rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
}
}
range_seg_t *
range_tree_first(range_tree_t *rt)
{
- return (avl_first(&rt->rt_root));
+ return (zfs_btree_first(&rt->rt_root, NULL));
}
uint64_t
@@ -629,7 +715,7 @@ range_tree_space(range_tree_t *rt)
uint64_t
range_tree_numsegs(range_tree_t *rt)
{
- return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root));
+ return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}
boolean_t
@@ -639,71 +725,76 @@ range_tree_is_empty(range_tree_t *rt)
return (range_tree_space(rt) == 0);
}
-/* Generic range tree functions for maintaining segments in an AVL tree. */
+/* ARGSUSED */
void
-rt_avl_create(range_tree_t *rt, void *arg)
-{
- avl_tree_t *tree = arg;
-
- avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
- offsetof(range_seg_t, rs_pp_node));
+rt_btree_create(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ size_t size;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, rt->rt_btree_compare, size);
}
+/* ARGSUSED */
void
-rt_avl_destroy(range_tree_t *rt, void *arg)
+rt_btree_destroy(range_tree_t *rt, void *arg)
{
- avl_tree_t *tree = arg;
+ zfs_btree_t *size_tree = arg;
+ ASSERT0(zfs_btree_numnodes(size_tree));
- ASSERT0(avl_numnodes(tree));
- avl_destroy(tree);
+ zfs_btree_destroy(size_tree);
}
+/* ARGSUSED */
void
-rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- avl_tree_t *tree = arg;
- avl_add(tree, rs);
-}
+ zfs_btree_t *size_tree = arg;
-void
-rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- avl_tree_t *tree = arg;
- avl_remove(tree, rs);
+ zfs_btree_add(size_tree, rs);
}
+/* ARGSUSED */
void
-rt_avl_vacate(range_tree_t *rt, void *arg)
+rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- /*
- * Normally one would walk the tree freeing nodes along the way.
- * Since the nodes are shared with the range trees we can avoid
- * walking all nodes and just reinitialize the avl tree. The nodes
- * will be freed by the range tree, so we don't want to free them here.
- */
- rt_avl_create(rt, arg);
-}
+ zfs_btree_t *size_tree = arg;
-uint64_t
-range_tree_min(range_tree_t *rt)
-{
- range_seg_t *rs = avl_first(&rt->rt_root);
- return (rs != NULL ? rs->rs_start : 0);
+ zfs_btree_remove(size_tree, rs);
}
-uint64_t
-range_tree_max(range_tree_t *rt)
+/* ARGSUSED */
+void
+rt_btree_vacate(range_tree_t *rt, void *arg)
{
- range_seg_t *rs = avl_last(&rt->rt_root);
- return (rs != NULL ? rs->rs_end : 0);
-}
+ zfs_btree_t *size_tree = arg;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
-uint64_t
-range_tree_span(range_tree_t *rt)
-{
- return (range_tree_max(rt) - range_tree_min(rt));
+ rt_btree_create(rt, arg);
}
+range_tree_ops_t rt_btree_ops = {
+ .rtop_create = rt_btree_create,
+ .rtop_destroy = rt_btree_destroy,
+ .rtop_add = rt_btree_add,
+ .rtop_remove = rt_btree_remove,
+ .rtop_vacate = rt_btree_vacate
+};
+
/*
* Remove any overlapping ranges between the given segment [start, end)
* from removefrom. Add non-overlapping leftovers to addto.
@@ -712,42 +803,62 @@ void
range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
range_tree_t *removefrom, range_tree_t *addto)
{
- avl_index_t where;
- range_seg_t starting_rs = {
- .rs_start = start,
- .rs_end = start + 1
- };
+ zfs_btree_index_t where;
+ range_seg_max_t starting_rs;
+ rs_set_start(&starting_rs, removefrom, start);
+ rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs,
+ removefrom) + 1);
- range_seg_t *curr = avl_find(&removefrom->rt_root,
+ range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
&starting_rs, &where);
if (curr == NULL)
- curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER);
+ curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
range_seg_t *next;
for (; curr != NULL; curr = next) {
- next = AVL_NEXT(&removefrom->rt_root, curr);
-
if (start == end)
return;
VERIFY3U(start, <, end);
/* there is no overlap */
- if (end <= curr->rs_start) {
+ if (end <= rs_get_start(curr, removefrom)) {
range_tree_add(addto, start, end - start);
return;
}
- uint64_t overlap_start = MAX(curr->rs_start, start);
- uint64_t overlap_end = MIN(curr->rs_end, end);
+ uint64_t overlap_start = MAX(rs_get_start(curr, removefrom),
+ start);
+ uint64_t overlap_end = MIN(rs_get_end(curr, removefrom),
+ end);
uint64_t overlap_size = overlap_end - overlap_start;
ASSERT3S(overlap_size, >, 0);
+ range_seg_max_t rs;
+ rs_copy(curr, &rs, removefrom);
+
range_tree_remove(removefrom, overlap_start, overlap_size);
if (start < overlap_start)
range_tree_add(addto, start, overlap_start - start);
start = overlap_end;
+ next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
+ /*
+ * If we find something here, we only removed part of the
+ * curr segment. Either there's some left at the end
+ * because we've reached the end of the range we're removing,
+ * or there's some left at the start because we started
+ * partway through the range. Either way, we continue with
+ * the loop. If it's the former, we'll return at the start of
+ * the loop, and if it's the latter we'll see if there is more
+ * area to process.
+ */
+ if (next != NULL) {
+ ASSERT(start == end || start == rs_get_end(&rs,
+ removefrom));
+ }
+
+ next = zfs_btree_next(&removefrom->rt_root, &where, &where);
}
VERIFY3P(curr, ==, NULL);
@@ -767,9 +878,30 @@ void
range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
range_tree_t *addto)
{
- for (range_seg_t *rs = avl_first(&rt->rt_root); rs;
- rs = AVL_NEXT(&rt->rt_root, rs)) {
- range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end,
- removefrom, addto);
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ range_tree_remove_xor_add_segment(rs_get_start(rs, rt),
+ rs_get_end(rs, rt), removefrom, addto);
}
}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_start(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_end(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+ return (range_tree_max(rt) - range_tree_min(rt));
+}
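
The range_tree.c hunks above replace direct AVL traversal with the zfs_btree_* iterator API plus the rs_get_*()/rs_set_*() accessors, which decode whichever segment encoding (RANGE_SEG32, RANGE_SEG64 or RANGE_SEG_GAP) the owning tree was created with; range_tree_min(), range_tree_max() and range_tree_span() are rebuilt on zfs_btree_first()/zfs_btree_last() accordingly. A minimal walk sketch using only names visible in the diff, assuming the in-tree <sys/btree.h> and <sys/range_tree.h> headers; the debug print is illustrative only:

#include <sys/btree.h>		/* zfs_btree_first(), zfs_btree_next() */
#include <sys/range_tree.h>	/* range_seg_t, rs_get_start(), rs_get_end() */

/* Walk every segment of a range tree in offset order (sketch). */
static void
walk_range_tree(range_tree_t *rt)
{
	zfs_btree_index_t where;

	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		/* The accessors need the tree to know the segment encoding. */
		zfs_dbgmsg("segment [%llu, %llu)",
		    (u_longlong_t)rs_get_start(rs, rt),
		    (u_longlong_t)rs_get_end(rs, rt));
	}
}

The exported rt_btree_ops table serves the same purpose the removed rt_avl_* callbacks did: a caller that registers it as the range tree's ops, with a zfs_btree_t as the callback argument, gets a secondary, differently sorted B-tree kept in sync with the range tree, which appears to be how the size-sorted allocatable trees elsewhere in this commit are wired up.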
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 98bd2d330..cc2a681bd 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -252,7 +252,7 @@ layout_num_compare(const void *arg1, const void *arg2)
const sa_lot_t *node1 = (const sa_lot_t *)arg1;
const sa_lot_t *node2 = (const sa_lot_t *)arg2;
- return (AVL_CMP(node1->lot_num, node2->lot_num));
+ return (TREE_CMP(node1->lot_num, node2->lot_num));
}
static int
@@ -261,11 +261,11 @@ layout_hash_compare(const void *arg1, const void *arg2)
const sa_lot_t *node1 = (const sa_lot_t *)arg1;
const sa_lot_t *node2 = (const sa_lot_t *)arg2;
- int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash);
+ int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
if (likely(cmp))
return (cmp);
- return (AVL_CMP(node1->lot_instance, node2->lot_instance));
+ return (TREE_CMP(node1->lot_instance, node2->lot_instance));
}
boolean_t
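
The only change in sa.c is the comparator macro rename: AVL_CMP and friends become TREE_CMP and friends so one set of three-way comparators can serve both avl_tree_t and zfs_btree_t users; the macros themselves live in include/sys/avl.h. In substance they are the usual branch-free sign computations, roughly:

/* Three-way comparisons returning -1, 0, or 1. */
#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	TREE_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

The same mechanical rename accounts for most of the spa.c, spa_misc.c, unique.c, vdev_cache.c, vdev_label.c, vdev_queue.c, zap_micro.c, zfs_fuid.c, zfs_rlock.c and zil.c hunks below.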
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 7e5c474eb..529ffc03d 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -939,7 +939,7 @@ spa_error_entry_compare(const void *a, const void *b)
ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
sizeof (zbookmark_phys_t));
- return (AVL_ISIGN(ret));
+ return (TREE_ISIGN(ret));
}
/*
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index ac0dd5ba9..78bf110b4 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -58,6 +58,7 @@
#include <sys/ddt.h>
#include <sys/kstat.h>
#include "zfs_prop.h"
+#include <sys/btree.h>
#include <sys/zfeature.h>
#include <sys/qat.h>
@@ -619,7 +620,7 @@ spa_log_sm_sort_by_txg(const void *va, const void *vb)
const spa_log_sm_t *a = va;
const spa_log_sm_t *b = vb;
- return (AVL_CMP(a->sls_txg, b->sls_txg));
+ return (TREE_CMP(a->sls_txg, b->sls_txg));
}
/*
@@ -939,7 +940,7 @@ spa_aux_compare(const void *a, const void *b)
const spa_aux_t *sa = (const spa_aux_t *)a;
const spa_aux_t *sb = (const spa_aux_t *)b;
- return (AVL_CMP(sa->aux_guid, sb->aux_guid));
+ return (TREE_CMP(sa->aux_guid, sb->aux_guid));
}
void
@@ -2270,7 +2271,7 @@ spa_name_compare(const void *a1, const void *a2)
s = strcmp(s1->spa_name, s2->spa_name);
- return (AVL_ISIGN(s));
+ return (TREE_ISIGN(s));
}
void
@@ -2318,8 +2319,8 @@ spa_init(int mode)
fm_init();
zfs_refcount_init();
unique_init();
- range_tree_init();
- metaslab_alloc_trace_init();
+ zfs_btree_init();
+ metaslab_stat_init();
ddt_init();
zio_init();
dmu_init();
@@ -2353,8 +2354,8 @@ spa_fini(void)
dmu_fini();
zio_fini();
ddt_fini();
- metaslab_alloc_trace_fini();
- range_tree_fini();
+ metaslab_stat_fini();
+ zfs_btree_fini();
unique_fini();
zfs_refcount_fini();
fm_fini();
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index b9467b26b..eb2c36942 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -523,8 +523,9 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
* dbuf must be dirty for the changes in sm_phys to take effect.
*/
static void
-space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
- uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
+ maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
+ void *tag, dmu_tx_t *tx)
{
ASSERT3U(words, !=, 0);
ASSERT3U(words, <=, 2);
@@ -548,14 +549,14 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
ASSERT3P(block_cursor, <=, block_end);
- uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+ uint64_t size = (rend - rstart) >> sm->sm_shift;
+ uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
- ASSERT3U(rs->rs_start, >=, sm->sm_start);
- ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
- ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
- ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+ ASSERT3U(rstart, >=, sm->sm_start);
+ ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(rend - rstart, <=, sm->sm_size);
+ ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
while (size != 0) {
ASSERT3P(block_cursor, <=, block_end);
@@ -673,10 +674,14 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_buf_will_dirty(db, tx);
- avl_tree_t *t = &rt->rt_root;
- for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
- uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
- uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ zfs_btree_t *t = &rt->rt_root;
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
+ rs = zfs_btree_next(t, &where, &where)) {
+ uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
+ sm->sm_shift;
+ uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
+ sm->sm_shift;
uint8_t words = 1;
/*
@@ -701,8 +706,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
spa_get_random(100) == 0)))
words = 2;
- space_map_write_seg(sm, rs, maptype, vdev_id, words,
- &db, FTAG, tx);
+ space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), maptype, vdev_id, words, &db, FTAG, tx);
}
dmu_buf_rele(db, FTAG);
@@ -749,7 +754,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
else
sm->sm_phys->smp_alloc -= range_tree_space(rt);
- uint64_t nodes = avl_numnodes(&rt->rt_root);
+ uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
uint64_t rt_space = range_tree_space(rt);
space_map_write_impl(sm, rt, maptype, vdev_id, tx);
@@ -758,7 +763,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Ensure that the space_map's accounting wasn't changed
* while we were in the middle of writing it out.
*/
- VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+ VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
VERIFY3U(range_tree_space(rt), ==, rt_space);
}
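
In space_map.c the per-segment writer now receives raw start and end offsets instead of a range_seg_t pointer, so the on-disk writer stays independent of the new in-memory segment encodings; the caller decodes each segment with rs_get_start()/rs_get_end() while iterating the B-tree. Shape of the updated interface and call, taken from the hunks above (a sketch, not a complete function):

/* Bounds arrive as plain offsets; the encoding stays with the caller. */
static void space_map_write_seg(space_map_t *sm, uint64_t rstart,
    uint64_t rend, maptype_t maptype, uint64_t vdev_id, uint8_t words,
    dmu_buf_t **dbp, void *tag, dmu_tx_t *tx);

/* Per segment of the range tree being persisted: */
space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, rt),
    maptype, vdev_id, words, &db, FTAG, tx);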
diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c
index aa289ba10..080fc6646 100644
--- a/module/zfs/space_reftree.c
+++ b/module/zfs/space_reftree.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -57,11 +57,11 @@ space_reftree_compare(const void *x1, const void *x2)
const space_ref_t *sr1 = (const space_ref_t *)x1;
const space_ref_t *sr2 = (const space_ref_t *)x2;
- int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset);
+ int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset);
if (likely(cmp))
return (cmp);
- return (AVL_PCMP(sr1, sr2));
+ return (TREE_PCMP(sr1, sr2));
}
void
@@ -109,10 +109,13 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
void
space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
{
- range_seg_t *rs;
+ zfs_btree_index_t where;
- for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
- space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs =
+ zfs_btree_next(&rt->rt_root, &where, &where)) {
+ space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), refcnt);
+ }
}
/*
diff --git a/module/zfs/unique.c b/module/zfs/unique.c
index 5cdd025f4..0e076797a 100644
--- a/module/zfs/unique.c
+++ b/module/zfs/unique.c
@@ -45,7 +45,7 @@ unique_compare(const void *a, const void *b)
const unique_t *una = (const unique_t *)a;
const unique_t *unb = (const unique_t *)b;
- return (AVL_CMP(una->un_value, unb->un_value));
+ return (TREE_CMP(una->un_value, unb->un_value));
}
void
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index af2d1a25a..3a120b001 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -216,7 +216,7 @@ vdev_getops(const char *type)
/* ARGSUSED */
void
-vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
{
res->rs_start = in->rs_start;
res->rs_end = in->rs_end;
@@ -528,7 +528,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
- vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
+ vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
/*
* Initialize rate limit structs for events. We rate limit ZIO delay
@@ -561,7 +562,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
- vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
+ vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+ 0);
}
txg_list_create(&vd->vdev_ms_list, spa,
offsetof(struct metaslab, ms_txg_node));
@@ -2539,14 +2541,11 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
- range_seg_t *rs;
-
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
ASSERT0(vd->vdev_children);
- rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
- return (rs->rs_start - 1);
+ return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
}
/*
@@ -2555,14 +2554,11 @@ vdev_dtl_min(vdev_t *vd)
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
- range_seg_t *rs;
-
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
ASSERT0(vd->vdev_children);
- rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
- return (rs->rs_end);
+ return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
}
/*
@@ -2883,7 +2879,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
ASSERT(vd->vdev_dtl_sm != NULL);
}
- rtsync = range_tree_create(NULL, NULL);
+ rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
mutex_enter(&vd->vdev_dtl_lock);
range_tree_walk(rt, range_tree_add, rtsync);
@@ -4730,7 +4726,8 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
* translation function to do the real conversion.
*/
void
-vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs)
{
/*
* Walk up the vdev tree
@@ -4757,7 +4754,7 @@ vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
* range into its physical components by calling the
* vdev specific translate function.
*/
- range_seg_t intermediate = { { { 0, 0 } } };
+ range_seg64_t intermediate = { 0 };
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
physical_rs->rs_start = intermediate.rs_start;
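
vdev_xlate() and the per-vdev-type translate callbacks now take range_seg64_t, a plain start/end pair, since the opaque range_seg_t no longer has fixed fields a translator could touch directly. A minimal caller sketch in the style used by vdev_initialize.c and vdev_trim.c below, assuming a leaf vdev_t *vd and a loaded metaslab_t *msp:

range_seg64_t logical_rs, physical_rs;

/* Describe the logical extent to translate. */
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;

/* Map it to physical offsets on this leaf vdev. */
vdev_xlate(vd, &logical_rs, &physical_rs);

uint64_t size = physical_rs.rs_end - physical_rs.rs_start;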
diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c
index 35a5b480b..6cf449d4a 100644
--- a/module/zfs/vdev_cache.c
+++ b/module/zfs/vdev_cache.c
@@ -111,7 +111,7 @@ vdev_cache_offset_compare(const void *a1, const void *a2)
const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
- return (AVL_CMP(ve1->ve_offset, ve2->ve_offset));
+ return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
}
static int
@@ -120,7 +120,7 @@ vdev_cache_lastused_compare(const void *a1, const void *a2)
const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
- int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused);
+ int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
if (likely(cmp))
return (cmp);
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 169b9282d..a27ab3c25 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -291,11 +291,13 @@ vdev_initialize_block_free(abd_t *data)
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
- avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+ range_tree_t *rt = vd->vdev_initialize_tree;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
- for (range_seg_t *rs = avl_first(rt); rs != NULL;
- rs = AVL_NEXT(rt, rs)) {
- uint64_t size = rs->rs_end - rs->rs_start;
+ for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
+ rs = zfs_btree_next(bt, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
/* Split range into legally-sized physical chunks */
uint64_t writes_required =
@@ -305,7 +307,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
int error;
error = vdev_initialize_write(vd,
- VDEV_LABEL_START_SIZE + rs->rs_start +
+ VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
(w * zfs_initialize_chunk_size),
MIN(size - (w * zfs_initialize_chunk_size),
zfs_initialize_chunk_size), data);
@@ -341,7 +343,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
- range_seg_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -365,10 +367,14 @@ vdev_initialize_calculate_progress(vdev_t *vd)
*/
VERIFY0(metaslab_load(msp));
- for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
- rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
- logical_rs.rs_start = rs->rs_start;
- logical_rs.rs_end = rs->rs_end;
+ zfs_btree_index_t where;
+ range_tree_t *rt = msp->ms_allocatable;
+ for (range_seg_t *rs =
+ zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where,
+ &where)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
@@ -422,7 +428,7 @@ void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
vdev_t *vd = arg;
- range_seg_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
@@ -481,7 +487,8 @@ vdev_initialize_thread(void *arg)
abd_t *deadbeef = vdev_initialize_block_alloc();
- vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+ vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
for (uint64_t i = 0; !vd->vdev_detached &&
i < vd->vdev_top->vdev_ms_count; i++) {
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 6320732ed..6bb3c3c68 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1197,12 +1197,12 @@ retry:
static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
- int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+ int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
if (likely(cmp))
return (cmp);
- cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
if (likely(cmp))
return (cmp);
@@ -1226,7 +1226,7 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
seq2 = MMP_SEQ(ub2);
- return (AVL_CMP(seq1, seq2));
+ return (TREE_CMP(seq1, seq2));
}
struct ubl_cbdata {
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 5e4db3df6..e156e2b01 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -218,12 +218,12 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
+ int cmp = TREE_CMP(z1->io_offset, z2->io_offset);
if (likely(cmp))
return (cmp);
- return (AVL_PCMP(z1, z2));
+ return (TREE_PCMP(z1, z2));
}
static inline avl_tree_t *
@@ -250,12 +250,12 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp);
+ int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
if (likely(cmp))
return (cmp);
- return (AVL_PCMP(z1, z2));
+ return (TREE_PCMP(z1, z2));
}
static int
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index f63ccaa94..3408ddf72 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
*/
@@ -1638,7 +1638,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- range_seg_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs;
logical_rs.rs_start = zio->io_offset;
logical_rs.rs_end = logical_rs.rs_start +
vdev_raidz_asize(zio->io_vd, zio->io_size);
@@ -2372,7 +2372,7 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
}
static void
-vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
{
vdev_t *raidvd = cvd->vdev_parent;
ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 549087163..a4fac1cc5 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -198,11 +198,12 @@ spa_vdev_removal_create(vdev_t *vd)
spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
- svr->svr_allocd_segs = range_tree_create(NULL, NULL);
+ svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
- svr->svr_frees[i] = range_tree_create(NULL, NULL);
+ svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
list_create(&svr->svr_new_segments[i],
sizeof (vdev_indirect_mapping_entry_t),
offsetof(vdev_indirect_mapping_entry_t, vime_node));
@@ -967,18 +968,15 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
* the allocation at the end of a segment, thus avoiding
* additional split blocks.
*/
- range_seg_t search;
- avl_index_t where;
- search.rs_start = start + maxalloc;
- search.rs_end = search.rs_start;
- range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
- if (rs == NULL) {
- rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
- } else {
- rs = AVL_PREV(&segs->rt_root, rs);
- }
+ range_seg_max_t search;
+ zfs_btree_index_t where;
+ rs_set_start(&search, segs, start + maxalloc);
+ rs_set_end(&search, segs, start + maxalloc);
+ (void) zfs_btree_find(&segs->rt_root, &search, &where);
+ range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
+ &where);
if (rs != NULL) {
- size = rs->rs_end - start;
+ size = rs_get_end(rs, segs) - start;
} else {
/*
* There are no segments that end before maxalloc.
@@ -1011,20 +1009,22 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
* relative to the start of the range to be copied (i.e. relative to the
* local variable "start").
*/
- range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
-
- range_seg_t *rs = avl_first(&segs->rt_root);
- ASSERT3U(rs->rs_start, ==, start);
- uint64_t prev_seg_end = rs->rs_end;
- while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
- if (rs->rs_start >= start + size) {
+ range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
+ ASSERT3U(rs_get_start(rs, segs), ==, start);
+ uint64_t prev_seg_end = rs_get_end(rs, segs);
+ while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
+ if (rs_get_start(rs, segs) >= start + size) {
break;
} else {
range_tree_add(obsolete_segs,
prev_seg_end - start,
- rs->rs_start - prev_seg_end);
+ rs_get_start(rs, segs) - prev_seg_end);
}
- prev_seg_end = rs->rs_end;
+ prev_seg_end = rs_get_end(rs, segs);
}
/* We don't end in the middle of an obsolete range */
ASSERT3U(start + size, <=, prev_seg_end);
@@ -1268,9 +1268,10 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
* allocated segments that we are copying. We may also be copying
* free segments (of up to vdev_removal_max_span bytes).
*/
- range_tree_t *segs = range_tree_create(NULL, NULL);
+ range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
for (;;) {
- range_seg_t *rs = range_tree_first(svr->svr_allocd_segs);
+ range_tree_t *rt = svr->svr_allocd_segs;
+ range_seg_t *rs = range_tree_first(rt);
if (rs == NULL)
break;
@@ -1279,17 +1280,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
if (range_tree_is_empty(segs)) {
/* need to truncate the first seg based on max_alloc */
- seg_length =
- MIN(rs->rs_end - rs->rs_start, *max_alloc);
+ seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
+ rt), *max_alloc);
} else {
- if (rs->rs_start - range_tree_max(segs) >
+ if (rs_get_start(rs, rt) - range_tree_max(segs) >
vdev_removal_max_span) {
/*
* Including this segment would cause us to
* copy a larger unneeded chunk than is allowed.
*/
break;
- } else if (rs->rs_end - range_tree_min(segs) >
+ } else if (rs_get_end(rs, rt) - range_tree_min(segs) >
*max_alloc) {
/*
* This additional segment would extend past
@@ -1298,13 +1299,14 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
*/
break;
} else {
- seg_length = rs->rs_end - rs->rs_start;
+ seg_length = rs_get_end(rs, rt) -
+ rs_get_start(rs, rt);
}
}
- range_tree_add(segs, rs->rs_start, seg_length);
+ range_tree_add(segs, rs_get_start(rs, rt), seg_length);
range_tree_remove(svr->svr_allocd_segs,
- rs->rs_start, seg_length);
+ rs_get_start(rs, rt), seg_length);
}
if (range_tree_is_empty(segs)) {
@@ -1483,7 +1485,7 @@ spa_vdev_remove_thread(void *arg)
vca.vca_msp = msp;
zfs_dbgmsg("copying %llu segments for metaslab %llu",
- avl_numnodes(&svr->svr_allocd_segs->rt_root),
+ zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root),
msp->ms_id);
while (!svr->svr_thread_exit &&
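
The segment search in spa_vdev_copy_segment() swaps avl_find() plus avl_nearest(AVL_BEFORE) for zfs_btree_find() followed by zfs_btree_prev(): on a miss zfs_btree_find() still fills in the insertion point, and stepping back from that index (or from an exact hit) yields the last segment that starts before the search key. A sketch of the idiom; "cutoff" stands in for the start + maxalloc value used in the hunk above:

zfs_btree_index_t where;
range_seg_max_t search;	/* large enough for any segment encoding */

/* Search key: a zero-length segment at the cutoff offset. */
rs_set_start(&search, segs, cutoff);
rs_set_end(&search, segs, cutoff);

/*
 * Whether or not the find hits, "where" ends up at or just past the
 * cutoff, so the previous entry is the last segment starting before it.
 */
(void) zfs_btree_find(&segs->rt_root, &search, &where);
range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where);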
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 771e7a159..c7c429cbd 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -523,7 +523,8 @@ static int
vdev_trim_ranges(trim_args_t *ta)
{
vdev_t *vd = ta->trim_vdev;
- avl_tree_t *rt = &ta->trim_tree->rt_root;
+ zfs_btree_t *t = &ta->trim_tree->rt_root;
+ zfs_btree_index_t idx;
uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
spa_t *spa = vd->vdev_spa;
@@ -531,9 +532,10 @@ vdev_trim_ranges(trim_args_t *ta)
ta->trim_start_time = gethrtime();
ta->trim_bytes_done = 0;
- for (range_seg_t *rs = avl_first(rt); rs != NULL;
- rs = AVL_NEXT(rt, rs)) {
- uint64_t size = rs->rs_end - rs->rs_start;
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
+ ta->trim_tree);
if (extent_bytes_min && size < extent_bytes_min) {
spa_iostats_trim_add(spa, ta->trim_type,
@@ -548,9 +550,9 @@ vdev_trim_ranges(trim_args_t *ta)
int error;
error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
- rs->rs_start + (w * extent_bytes_max),
- MIN(size - (w * extent_bytes_max),
- extent_bytes_max));
+ rs_get_start(rs, ta->trim_tree) +
+ (w * extent_bytes_max), MIN(size -
+ (w * extent_bytes_max), extent_bytes_max));
if (error != 0) {
return (error);
}
@@ -588,7 +590,7 @@ vdev_trim_calculate_progress(vdev_t *vd)
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
- range_seg_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -611,10 +613,13 @@ vdev_trim_calculate_progress(vdev_t *vd)
*/
VERIFY0(metaslab_load(msp));
- for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
- rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
- logical_rs.rs_start = rs->rs_start;
- logical_rs.rs_end = rs->rs_end;
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t idx;
+ for (range_seg_t *rs = zfs_btree_first(bt, &idx);
+ rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
@@ -706,7 +711,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
trim_args_t *ta = arg;
vdev_t *vd = ta->trim_vdev;
- range_seg_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
@@ -719,7 +724,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
metaslab_t *msp = ta->trim_msp;
VERIFY0(metaslab_load(msp));
VERIFY3B(msp->ms_loaded, ==, B_TRUE);
- VERIFY(range_tree_find(msp->ms_allocatable, start, size));
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}
ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -798,7 +803,7 @@ vdev_trim_thread(void *arg)
ta.trim_vdev = vd;
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
- ta.trim_tree = range_tree_create(NULL, NULL);
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
ta.trim_type = TRIM_TYPE_MANUAL;
ta.trim_flags = 0;
@@ -1080,7 +1085,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
VERIFY3B(msp->ms_loaded, ==, B_TRUE);
VERIFY3U(msp->ms_disabled, >, 0);
- VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL);
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}
/*
@@ -1178,7 +1183,8 @@ vdev_autotrim_thread(void *arg)
* Allocate an empty range tree which is swapped in
* for the existing ms_trim tree while it is processed.
*/
- trim_tree = range_tree_create(NULL, NULL);
+ trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
range_tree_swap(&msp->ms_trim, &trim_tree);
ASSERT(range_tree_is_empty(msp->ms_trim));
@@ -1232,7 +1238,8 @@ vdev_autotrim_thread(void *arg)
if (!cvd->vdev_ops->vdev_op_leaf)
continue;
- ta->trim_tree = range_tree_create(NULL, NULL);
+ ta->trim_tree = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
range_tree_walk(trim_tree,
vdev_trim_range_add, ta);
}
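
vdev_trim.c also picks up the new boolean helper: where the old code checked range_tree_find() against NULL, the VERIFYs now use range_tree_contains(), which asks the same "is this extent entirely present" question without exposing a segment pointer. Both forms side by side (a sketch; the old one is shown only for contrast):

/* Old style: rely on range_tree_find() returning a segment or NULL. */
VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL);

/* New style: boolean containment check used by the hunks above. */
VERIFY(range_tree_contains(msp->ms_allocatable, start, size));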
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 467812ff6..c3d42d0ee 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -280,11 +280,11 @@ mze_compare(const void *arg1, const void *arg2)
const mzap_ent_t *mze1 = arg1;
const mzap_ent_t *mze2 = arg2;
- int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
+ int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
if (likely(cmp))
return (cmp);
- return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
+ return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
}
static void
diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c
index 78071707a..925d1934f 100644
--- a/module/zfs/zfs_fuid.c
+++ b/module/zfs/zfs_fuid.c
@@ -73,7 +73,7 @@ idx_compare(const void *arg1, const void *arg2)
const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
- return (AVL_CMP(node1->f_idx, node2->f_idx));
+ return (TREE_CMP(node1->f_idx, node2->f_idx));
}
/*
@@ -88,7 +88,7 @@ domain_compare(const void *arg1, const void *arg2)
val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
- return (AVL_ISIGN(val));
+ return (TREE_ISIGN(val));
}
void
diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c
index 94203a40c..db587c731 100644
--- a/module/zfs/zfs_rlock.c
+++ b/module/zfs/zfs_rlock.c
@@ -109,7 +109,7 @@ zfs_rangelock_compare(const void *arg1, const void *arg2)
const locked_range_t *rl1 = (const locked_range_t *)arg1;
const locked_range_t *rl2 = (const locked_range_t *)arg2;
- return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
+ return (TREE_CMP(rl1->lr_offset, rl2->lr_offset));
}
/*
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 30a73515c..7e65ac090 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -146,11 +146,11 @@ zil_bp_compare(const void *x1, const void *x2)
const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
- int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+ int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
if (likely(cmp))
return (cmp);
- return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+ return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
}
static void
@@ -535,7 +535,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
- return (AVL_CMP(v1, v2));
+ return (TREE_CMP(v1, v2));
}
static lwb_t *
@@ -1869,7 +1869,7 @@ zil_aitx_compare(const void *x1, const void *x2)
const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
- return (AVL_CMP(o1, o2));
+ return (TREE_CMP(o1, o2));
}
/*