aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys/btree.h
blob: 6e05eee8f01d4d169144e115b23f20b2d0184f2b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

#ifndef	_BTREE_H
#define	_BTREE_H

#ifdef	__cplusplus
extern "C" {
#endif

#include	<sys/zfs_context.h>

/*
 * This file defines the interface for a B-Tree implementation for ZFS. The
 * tree can be used to store arbitrary sortable data types with low overhead
 * and good operation performance. In addition the tree intelligently
 * optimizes bulk in-order insertions to improve memory use and performance.
 *
 * Note that for all B-Tree functions, the values returned are pointers to the
 * internal copies of the data in the tree. The internal data can only be
 * safely mutated if the changes cannot change the ordering of the element
 * with respect to any other elements in the tree.
 *
 * The major drawback of the B-Tree is that any returned elements or indexes
 * are only valid until a side-effectful operation occurs, since these can
 * result in reallocation or relocation of data. Side effectful operations are
 * defined as insertion, removal, and zfs_btree_destroy_nodes.
 *
 * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
 * nodes have an array of children pointing to other nodes, and an array of
 * elements that act as separators between the elements of the subtrees rooted
 * at its children. Leaf nodes only contain data elements, and form the bottom
 * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
 * elements in the core nodes are not copies of or references to leaf node
 * elements.  Each element occurs only once in the tree, no matter what kind
 * of node it is in.
 *
 * The tree's height is the same throughout, unlike many other forms of search
 * tree. Each node (except for the root) must be between half minus one and
 * completely full of elements (and children) at all times. Any operation that
 * would put the node outside of that range results in a rebalancing operation
 * (taking, merging, or splitting).
 *
 * This tree was implemented using descriptions from Wikipedia's articles on
 * B-Trees and B+ Trees.
 */

/*
 * Decreasing these values results in smaller memmove operations, but more of
 * them, and increased memory overhead. Increasing these values results in
 * higher variance in operation time, and reduces memory overhead.
 */
/*
 * Element capacity of a core node. The child pointer array in
 * zfs_btree_core_t is declared with BTREE_CORE_ELEMS + 1 slots.
 */
#define	BTREE_CORE_ELEMS	126
/*
 * NOTE(review): presumably the default leaf node size in bytes, used when no
 * custom leaf size is passed to zfs_btree_create_custom() -- confirm.
 */
#define	BTREE_LEAF_SIZE		4096

/*
 * Cache for leaf nodes; only declared here, defined elsewhere.
 * NOTE(review): presumably created by zfs_btree_init() and torn down by
 * zfs_btree_fini() -- confirm.
 */
extern kmem_cache_t *zfs_btree_leaf_cache;

/*
 * Header common to both node types. It is the first member of both
 * zfs_btree_core_t (btc_hdr) and zfs_btree_leaf_t (btl_hdr).
 */
typedef struct zfs_btree_hdr {
	/*
	 * Parent core node.
	 * NOTE(review): presumably NULL for the root node -- confirm.
	 */
	struct zfs_btree_core	*bth_parent;
	/*
	 * Set to -1 to indicate core nodes. Other values represent first
	 * valid element offset for leaf nodes. The field is unsigned, so the
	 * -1 marker is stored as UINT32_MAX.
	 */
	uint32_t		bth_first;
	/*
	 * For both leaf and core nodes, represents the number of elements in
	 * the node. For core nodes, they will have bth_count + 1 children.
	 */
	uint32_t		bth_count;
} zfs_btree_hdr_t;

/*
 * Core (interior) node: the common header, an array of child pointers, and
 * the separator elements stored as raw bytes.
 */
typedef struct zfs_btree_core {
	zfs_btree_hdr_t	btc_hdr;
	/*
	 * One slot more than BTREE_CORE_ELEMS, because a core node with
	 * bth_count elements has bth_count + 1 children.
	 */
	zfs_btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];
	/*
	 * Flexible array member holding the element bytes. Its length is not
	 * fixed by this declaration; it is determined when the node is
	 * allocated.
	 */
	uint8_t		btc_elems[];
} zfs_btree_core_t;

/*
 * Leaf node: the common header followed directly by the element bytes. Leaf
 * nodes have no child pointers.
 */
typedef struct zfs_btree_leaf {
	zfs_btree_hdr_t	btl_hdr;
	/*
	 * Flexible array member holding the element bytes. Its length is not
	 * fixed by this declaration; it is determined when the node is
	 * allocated.
	 */
	uint8_t		btl_elems[];
} zfs_btree_leaf_t;

/*
 * A position in the tree: a node, an element offset within that node, and a
 * flag saying whether the position is at that offset or just before it.
 */
typedef struct zfs_btree_index {
	/* Node the position refers to. */
	zfs_btree_hdr_t	*bti_node;
	/* Element offset within bti_node. */
	uint32_t	bti_offset;
	/*
	 * True if the location is before the listed offset, false if it's at
	 * the listed offset.
	 */
	boolean_t	bti_before;
} zfs_btree_index_t;

/* Forward declaration; struct btree is defined below. */
typedef struct btree zfs_btree_t;
/*
 * Signature of a function that searches a single buffer of elements.
 * Arguments, in order: the tree, the element buffer, the number of elements
 * in the buffer, the value being searched for, and the index to fill in.
 * Functions generated by ZFS_BTREE_FIND_IN_BUF_FUNC() have this signature;
 * they return a pointer to the matching element, or NULL if none matches.
 */
typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
    const void *, zfs_btree_index_t *);

/*
 * Per-tree state, known to callers as zfs_btree_t.
 */
struct btree {
	/* Element comparator; must return exactly -1, 0, or +1. */
	int (*bt_compar) (const void *, const void *);
	/* Function used to search a buffer of elements. */
	bt_find_in_buf_f	bt_find_in_buf;
	/*
	 * NOTE(review): presumably the "size" argument of zfs_btree_create(),
	 * i.e. the size of one element in bytes -- confirm.
	 */
	size_t			bt_elem_size;
	/* NOTE(review): presumably the leaf node size in bytes -- confirm. */
	size_t			bt_leaf_size;
	/* NOTE(review): presumably elements that fit in a leaf -- confirm. */
	uint32_t		bt_leaf_cap;
	/*
	 * NOTE(review): tree height; the type is signed, so negative values
	 * may carry meaning (e.g. an empty tree) -- confirm.
	 */
	int32_t			bt_height;
	/* NOTE(review): presumably the count of stored elements -- confirm. */
	uint64_t		bt_num_elems;
	/* NOTE(review): presumably the count of allocated nodes -- confirm. */
	uint64_t		bt_num_nodes;
	/* Root node of the tree. */
	zfs_btree_hdr_t		*bt_root;
	zfs_btree_leaf_t	*bt_bulk; // non-null if bulk loading
};

/*
 * Implementation of Shar's algorithm designed to accelerate binary search by
 * eliminating impossible to predict branches.
 *
 * For optimality, this should be used to generate the search function in the
 * same file as the comparator, and the comparator should be marked
 * `__attribute__((always_inline)) inline` so that the compiler will inline it.
 *
 * Arguments are:
 *
 * NAME   - The function name for this instance of the search function. Use it
 *          in a subsequent call to zfs_btree_create().
 * T      - The element type stored inside the B-Tree.
 * COMP   - A comparator to compare two nodes. It must return exactly -1, 0,
 *          or +1: -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
 *          TREE_CMP() from avl.h can be used in a boilerplate function.
 *
 * The macro expands to a static function with the bt_find_in_buf_f signature.
 * The function treats buf as an array of nelems elements of type T. Each loop
 * iteration halves the remaining range and advances the cursor by multiplying
 * the comparison result by the step, rather than by branching on it. The one
 * remaining candidate is then compared with value, and where is filled in:
 *
 * bti_offset - index of the candidate, plus one if the candidate compares
 *              less than value
 * bti_before - true unless the candidate compares equal to value
 *
 * The function returns a pointer to the candidate if it compares equal to
 * value, and NULL otherwise. The tree argument is unused.
 *
 * The final comparison is unconditional, so the element at buf is read even
 * when nelems is 0; buf must therefore hold at least one element. where is
 * written unconditionally and must not be NULL.
 *
 * The diagnostic push/ignored/pop pragmas silence -Wunknown-pragmas for the
 * generated function, which contains a "GCC unroll 9" pragma.
 */
/* BEGIN CSTYLED */
#define	ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP)			\
_Pragma("GCC diagnostic push")						\
_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")			\
static void *								\
NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,			\
    const void *value, zfs_btree_index_t *where)			\
{									\
	T *i = (T *)buf;						\
	(void) tree;							\
	_Pragma("GCC unroll 9")						\
	while (nelems > 1) {						\
		uint32_t half = nelems / 2;				\
		nelems -= half;						\
		i += (COMP(&i[half - 1], value) < 0) * half;		\
	}								\
									\
	int comp = COMP(i, value);					\
	where->bti_offset = (i - (T *)buf) + (comp < 0);		\
	where->bti_before = (comp != 0);				\
									\
	if (comp == 0) {						\
		return (i);						\
	}								\
									\
	return (NULL);							\
}									\
_Pragma("GCC diagnostic pop")
/* END CSTYLED */

/*
 * Allocate and deallocate caches for btree nodes.
 *
 * NOTE(review): presumably zfs_btree_init() must be called before any tree
 * is created, and zfs_btree_fini() only after every tree has been destroyed
 * -- confirm against the implementation.
 */
void zfs_btree_init(void);
void zfs_btree_fini(void);

/*
 * Initialize a B-Tree. Arguments are:
 *
 * tree   - the tree to be initialized
 * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
 *          -1 for <, 0 for ==, and +1 for >
 * find   - optional function to accelerate searches inside B-Tree nodes
 *          through Shar's algorithm and comparator inlining. Setting this to
 *          NULL will use a generic function. The function should be created
 *          using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
 *          compar should be marked `__attribute__((always_inline)) inline` or
 *          performance is unlikely to improve very much.
 * size   - the value of sizeof(struct my_type)
 * lsize  - custom leaf size; taken by zfs_btree_create_custom() only, as its
 *          final argument
 *
 * NOTE(review): zfs_btree_create() presumably uses BTREE_LEAF_SIZE as the
 * leaf size -- confirm against the implementation.
 */
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
    bt_find_in_buf_f, size_t);
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
    bt_find_in_buf_f, size_t, size_t);

/*
 * Find a node with a matching value in the tree. Returns the matching node
 * found. If not found, it returns NULL and then if "where" is not NULL it sets
 * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest().
 *
 * The arguments after the tree are, in order:
 *
 * node   - node that has the value being looked for
 * where  - position for use with zfs_btree_nearest() or zfs_btree_add_idx(),
 *          may be NULL
 *
 * As with all B-Tree functions, the returned pointer refers to the tree's
 * internal copy of the data.
 *
 * NOTE(review): zfs_btree_nearest() is not declared in this header -- confirm
 * that it exists, or drop the references to it.
 */
void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);

/*
 * Insert a node into the tree.
 *
 * The arguments after the tree are, in order:
 *
 * node   - the node to insert
 * where  - position as returned from zfs_btree_find()
 *
 * Insertion is a side-effectful operation: elements and indexes returned by
 * earlier calls are no longer valid afterwards.
 */
void zfs_btree_add_idx(zfs_btree_t *, const void *, const zfs_btree_index_t *);

/*
 * Return the first or last valued node in the tree. Will return NULL if the
 * tree is empty. The index can be NULL if the location of the first or last
 * element isn't required.
 *
 * The returned pointer refers to the tree's internal copy of the data and is
 * only valid until the next insertion, removal, or zfs_btree_destroy_nodes()
 * call.
 */
void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);

/*
 * Return the next or previous valued node in the tree. The first index is
 * the starting position and is not modified. The second index can safely be
 * NULL, if the location of the next or previous value isn't required.
 *
 * NOTE(review): presumably returns NULL when there is no next or previous
 * value -- confirm against the implementation.
 */
void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
    zfs_btree_index_t *);
void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
    zfs_btree_index_t *);

/*
 * Get a value from a tree and an index. The index must still be valid, i.e.
 * no insertion, removal, or zfs_btree_destroy_nodes() call may have occurred
 * since it was obtained.
 */
void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);

/*
 * Add a single value to the tree. The value must not compare equal to any
 * other node already in the tree. Note that the value will be copied out, not
 * inserted directly. It is safe to free or destroy the value once this
 * function returns.
 *
 * Insertion is a side-effectful operation: elements and indexes returned by
 * earlier calls are no longer valid afterwards.
 */
void zfs_btree_add(zfs_btree_t *, const void *);

/*
 * Remove a single value from the tree.  The value must be in the tree. The
 * pointer passed in may be a pointer into a tree-controlled buffer, but it
 * need not be.
 *
 * Removal is a side-effectful operation: elements and indexes returned by
 * earlier calls are no longer valid afterwards.
 */
void zfs_btree_remove(zfs_btree_t *, const void *);

/*
 * Remove the value at the given location from the tree.
 *
 * Removal is a side-effectful operation: elements and indexes returned by
 * earlier calls are no longer valid afterwards. The index is passed through
 * a non-const pointer.
 * NOTE(review): confirm whether the index is modified by the call and whether
 * it is usable afterwards.
 */
void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *);

/*
 * Return the number of nodes in the tree
 *
 * NOTE(review): elsewhere in this header "node" is used for a stored value
 * (e.g. "Insert a node into the tree"), while struct btree tracks both
 * bt_num_elems and bt_num_nodes -- confirm which count is returned.
 */
ulong_t zfs_btree_numnodes(zfs_btree_t *);

/*
 * Used to destroy any remaining nodes in a tree. The cookie argument should
 * be initialized to NULL before the first call. Returns a node that has been
 * removed from the tree and may be free()'d. Returns NULL when the tree is
 * empty.
 *
 * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
 * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
 *
 * cookie - an index used to save state between calls to
 * zfs_btree_destroy_nodes()
 *
 * EXAMPLE:
 *	zfs_btree_t *tree;
 *	struct my_data *node;
 *	zfs_btree_index_t *cookie;
 *
 *	cookie = NULL;
 *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
 *		data_destroy(node);
 *	zfs_btree_destroy(tree);
 */
void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);

/*
 * Destroys all nodes in the tree quickly. This doesn't give the caller an
 * opportunity to iterate over each node and do its own cleanup; for that, use
 * zfs_btree_destroy_nodes().
 *
 * NOTE(review): presumably the tree is left empty and still usable, and
 * zfs_btree_destroy() is still required afterwards -- confirm.
 */
void zfs_btree_clear(zfs_btree_t *);

/*
 * Final destroy of a B-Tree. Arguments are:
 *
 * tree   - the empty tree to destroy
 *
 * The tree must already be empty; use zfs_btree_clear() or
 * zfs_btree_destroy_nodes() to remove any remaining nodes first.
 */
void zfs_btree_destroy(zfs_btree_t *tree);

/*
 * Runs a variety of self-checks on the btree to verify integrity.
 * NOTE(review): how a failed check is reported is not visible from this
 * header -- confirm against the implementation.
 */
void zfs_btree_verify(zfs_btree_t *tree);

#ifdef	__cplusplus
}
#endif

#endif	/* _BTREE_H */