-rw-r--r--  cmd/zdb/zdb.c                       8
-rw-r--r--  include/sys/btree.h                66
-rw-r--r--  module/Kbuild.in                   14
-rw-r--r--  module/Makefile.bsd                14
-rw-r--r--  module/zfs/btree.c                 22
-rw-r--r--  module/zfs/dsl_scan.c               7
-rw-r--r--  module/zfs/metaslab.c              23
-rw-r--r--  module/zfs/range_tree.c            18
-rw-r--r--  module/zfs/zap_micro.c              6
-rw-r--r--  tests/zfs-tests/cmd/btree_test.c    2
10 files changed, 154 insertions(+), 26 deletions(-)
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 5ab13b470..61f1258f7 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -326,7 +326,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
int err;
struct sublivelist_verify *sv = args;
- zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
+ zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
sizeof (sublivelist_verify_block_refcnt_t));
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
@@ -390,7 +390,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
(void) args;
sublivelist_verify_t sv;
- zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
int err = sublivelist_verify_func(&sv, dle);
zfs_btree_clear(&sv.sv_leftover);
@@ -682,7 +682,7 @@ livelist_metaslab_validate(spa_t *spa)
(void) printf("Verifying deleted livelist entries\n");
sublivelist_verify_t sv;
- zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
iterate_deleted_livelists(spa, livelist_verify, &sv);
@@ -716,7 +716,7 @@ livelist_metaslab_validate(spa_t *spa)
mv.mv_start = m->ms_start;
mv.mv_end = m->ms_start + m->ms_size;
zfs_btree_create(&mv.mv_livelist_allocs,
- livelist_block_compare,
+ livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
mv_populate_livelist_allocs(&mv, &sv);
diff --git a/include/sys/btree.h b/include/sys/btree.h
index 883abb518..6e05eee8f 100644
--- a/include/sys/btree.h
+++ b/include/sys/btree.h
@@ -105,8 +105,13 @@ typedef struct zfs_btree_index {
boolean_t bti_before;
} zfs_btree_index_t;
-typedef struct btree {
+typedef struct btree zfs_btree_t;
+typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
+ const void *, zfs_btree_index_t *);
+
+struct btree {
int (*bt_compar) (const void *, const void *);
+ bt_find_in_buf_f bt_find_in_buf;
size_t bt_elem_size;
size_t bt_leaf_size;
uint32_t bt_leaf_cap;
@@ -115,7 +120,54 @@ typedef struct btree {
uint64_t bt_num_nodes;
zfs_btree_hdr_t *bt_root;
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
-} zfs_btree_t;
+};
+
+/*
+ * Implementation of Shar's algorithm designed to accelerate binary search by
+ * eliminating impossible-to-predict branches.
+ *
+ * For optimality, this should be used to generate the search function in the
+ * same file as the comparator and the comparator should be marked
+ * `__attribute__((always_inline)) inline` so that the compiler will inline it.
+ *
+ * Arguments are:
+ *
+ * NAME - The function name for this instance of the search function. Use it
+ * in a subsequent call to zfs_btree_create().
+ * T - The element type stored inside the B-Tree.
+ * COMP - A comparator to compare two nodes; it must return exactly -1, 0,
+ * or +1: -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
+ * TREE_CMP() from avl.h can be used in a boilerplate function.
+ */
+/* BEGIN CSTYLED */
+#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \
+_Pragma("GCC diagnostic push") \
+_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \
+static void * \
+NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \
+ const void *value, zfs_btree_index_t *where) \
+{ \
+ T *i = (T *)buf; \
+ (void) tree; \
+ _Pragma("GCC unroll 9") \
+ while (nelems > 1) { \
+ uint32_t half = nelems / 2; \
+ nelems -= half; \
+ i += (COMP(&i[half - 1], value) < 0) * half; \
+ } \
+ \
+ int comp = COMP(i, value); \
+ where->bti_offset = (i - (T *)buf) + (comp < 0); \
+ where->bti_before = (comp != 0); \
+ \
+ if (comp == 0) { \
+ return (i); \
+ } \
+ \
+ return (NULL); \
+} \
+_Pragma("GCC diagnostic pop")
+/* END CSTYLED */
/*
* Allocate and deallocate caches for btree nodes.
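For illustration, a minimal standalone sketch of the search routine the macro
above generates for T = uint64_t (function name hypothetical, not part of the
patch). Each iteration halves the candidate range with a conditional add
rather than a branch, which the compiler can lower to a cmov; the sketch
assumes nelems >= 1, since the macro dereferences the buffer unconditionally:

#include <stdint.h>
#include <stddef.h>

static uint64_t *
find_u64_in_buf(uint64_t *buf, uint32_t nelems, uint64_t value,
    uint32_t *offset, int *before)
{
	uint64_t *i = buf;

	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		/* Skip the lower half when its last element sorts before value. */
		i += (i[half - 1] < value) * half;
	}

	/* Insertion point is one past i when i sorts before value. */
	*offset = (uint32_t)(i - buf) + (*i < value);
	*before = (*i != value);
	return (*i == value ? i : NULL);
}

For example, searching {1, 3, 5, 7} for 4 ends with i at element 5 and
reports offset 2 with no match: the slot where 4 would be inserted.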
@@ -129,13 +181,19 @@ void zfs_btree_fini(void);
* tree - the tree to be initialized
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1
* -1 for <, 0 for ==, and +1 for >
+ * find - optional function to accelerate searches inside B-Tree nodes
+ * through Shar's algorithm and comparator inlining. Setting this to
+ * NULL will use a generic function. The function should be created
+ * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
+ * compar should be marked `__attribute__((always_inline)) inline` or
+ * performance is unlikely to improve very much.
* size - the value of sizeof(struct my_type)
* lsize - custom leaf size
*/
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
- size_t);
+ bt_find_in_buf_f, size_t);
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
- size_t, size_t);
+ bt_find_in_buf_f, size_t, size_t);
/*
* Find a node with a matching value in the tree. Returns the matching node
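A minimal usage sketch of the extended interface (identifiers invented for
illustration; assumes sys/btree.h and sys/avl.h are included). The comparator
is marked always_inline so the macro-generated search can inline it, mirroring
the ext_size_compare() change in dsl_scan.c below; passing NULL as the third
argument instead selects the generic zfs_btree_find_in_buf():

__attribute__((always_inline)) inline
static int
my_u64_compare(const void *x, const void *y)
{
	const uint64_t *a = x;
	const uint64_t *b = y;

	return (TREE_CMP(*a, *b));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(my_u64_find_in_buf, uint64_t, my_u64_compare)

static void
my_tree_init(zfs_btree_t *bt)
{
	zfs_btree_create(bt, my_u64_compare, my_u64_find_in_buf,
	    sizeof (uint64_t));
}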
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 8d29f56c2..29a55c977 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y)
ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
endif
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
+# is not affected when unrolling is done.
+# Disable it until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+ifeq ($(CONFIG_X86),y)
+ifeq ($(CONFIG_CC_IS_CLANG),y)
+CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
+endif
+endif
+
ifneq ($(KBUILD_EXTMOD),)
@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 365609fb8..9464223f6 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -400,6 +400,20 @@ beforeinstall:
.include <bsd.kmod.mk>
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
+# is not affected when unrolling is done.
+# Disable it until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+.if ${CC} == "clang"
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64"
+CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
+CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
+CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false
+CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false
+.endif
+.endif
+
CFLAGS.sysctl_os.c= -include ../zfs_config.h
CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
index 4c25afaa8..af2b94a85 100644
--- a/module/zfs/btree.c
+++ b/module/zfs/btree.c
@@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
- size_t size)
+ bt_find_in_buf_f bt_find_in_buf, size_t size)
{
- zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
+ zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
+ BTREE_LEAF_SIZE);
}
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
+ const void *value, zfs_btree_index_t *where);
+
void
zfs_btree_create_custom(zfs_btree_t *tree,
int (*compar) (const void *, const void *),
+ bt_find_in_buf_f bt_find_in_buf,
size_t size, size_t lsize)
{
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
@@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree,
ASSERT3U(size, <=, esize / 2);
memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar;
+ tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
+ zfs_btree_find_in_buf : bt_find_in_buf;
tree->bt_elem_size = size;
tree->bt_leaf_size = lsize;
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
* element in the last leaf, it's in the last leaf or
* it's not in the tree.
*/
- void *d = zfs_btree_find_in_buf(tree,
+ void *d = tree->bt_find_in_buf(tree,
last_leaf->btl_elems +
last_leaf->btl_hdr.bth_first * size,
last_leaf->btl_hdr.bth_count, value, &idx);
@@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
ASSERT3P(node, !=, NULL);
- void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ void *d = tree->bt_find_in_buf(tree, node->btc_elems,
node->btc_hdr.bth_count, value, &idx);
EQUIV(d != NULL, !idx.bti_before);
if (d != NULL) {
@@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
*/
zfs_btree_leaf_t *leaf = (depth == 0 ?
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
- void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
+ void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
leaf->btl_hdr.bth_first * size,
leaf->btl_hdr.bth_count, value, &idx);
@@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
zfs_btree_index_t idx;
ASSERT(zfs_btree_is_core(par_hdr));
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
par_hdr->bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
uint32_t offset = idx.bti_offset;
@@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
}
zfs_btree_index_t idx;
zfs_btree_core_t *parent = hdr->bth_parent;
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 6cad33910..9ee719a5e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
* with a single operation. Plus it makes scrubs more sequential and reduces
* the chances that a minor extent change moves it within the B-tree.
*/
+__attribute__((always_inline)) inline
static int
ext_size_compare(const void *x, const void *y)
{
@@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y)
return (TREE_CMP(*a, *b));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
+ ext_size_compare)
+
static void
ext_size_create(range_tree_t *rt, void *arg)
{
(void) rt;
zfs_btree_t *size_tree = arg;
- zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
+ zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
+ sizeof (uint64_t));
}
static void
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 24d52a749..94b131fcd 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* Comparison function for the private size-ordered tree using 32-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
@@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
/*
* Comparison function for the private size-ordered tree using 64-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
@@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
+
typedef struct metaslab_rt_arg {
zfs_btree_t *mra_bt;
uint32_t mra_floor_shift;
@@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
+ range_seg32_t, metaslab_rangesize32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
+ range_seg64_t, metaslab_rangesize64_compare)
+
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (rt->rt_type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = metaslab_rangesize32_compare;
+ bt_find = metaslab_rt_find_rangesize32_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = metaslab_rangesize64_compare;
+ bt_find = metaslab_rt_find_rangesize64_in_buf;
break;
default:
panic("Invalid range seg type %d", rt->rt_type);
}
- zfs_btree_create(size_tree, compare, size);
+ zfs_btree_create(size_tree, compare, bt_find, size);
mrap->mra_floor_shift = metaslab_by_size_min_shift;
}
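The rewritten comparators above trade the `if (likely(cmp)) return (cmp);`
branch for arithmetic: TREE_CMP() returns exactly -1, 0, or +1, so when cmp
is nonzero !cmp is 0 and the expression yields cmp unchanged, and when cmp is
0 it yields the start-offset tie-breaker. A standalone check (hypothetical,
with TREE_CMP() restated locally to match its avl.h definition):

#include <assert.h>
#include <stdint.h>

#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b)))

static int
cmp_size_then_start(uint64_t size1, uint64_t start1,
    uint64_t size2, uint64_t start2)
{
	int cmp = TREE_CMP(size1, size2);

	/* cmp != 0: !cmp is 0, result is cmp; cmp == 0: result is tie-break. */
	return (cmp + !cmp * TREE_CMP(start1, start2));
}

int
main(void)
{
	assert(cmp_size_then_start(4, 0, 8, 0) == -1);	/* smaller size first */
	assert(cmp_size_then_start(8, 2, 8, 5) == -1);	/* tie: compare start */
	assert(cmp_size_then_start(8, 5, 8, 5) == 0);
	return (0);
}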
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 894c30fca..5174e2c46 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
rt->rt_histogram[idx]--;
}
+__attribute__((always_inline)) inline
static int
range_tree_seg32_compare(const void *x1, const void *x2)
{
@@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg64_compare(const void *x1, const void *x2)
{
@@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg_gap_compare(const void *x1, const void *x2)
{
@@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
+ range_tree_seg32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
+ range_tree_seg64_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
+ range_tree_seg_gap_compare)
+
range_tree_t *
range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = range_tree_seg32_compare;
+ bt_find = range_tree_seg32_find_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = range_tree_seg64_compare;
+ bt_find = range_tree_seg64_find_in_buf;
break;
case RANGE_SEG_GAP:
size = sizeof (range_seg_gap_t);
compare = range_tree_seg_gap_compare;
+ bt_find = range_tree_seg_gap_find_in_buf;
break;
default:
panic("Invalid range seg type %d", type);
}
- zfs_btree_create(&rt->rt_root, compare, size);
+ zfs_btree_create(&rt->rt_root, compare, bt_find, size);
rt->rt_ops = ops;
rt->rt_gap = gap;
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index d6ad8b2b8..085d9cd8b 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size)
}
}
+__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
@@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2)
(uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
+ mze_compare)
+
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
@@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
* 62 entries before we have to add a 2KB B-tree core node.
*/
zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
- sizeof (mzap_ent_t), 512);
+ mze_find_in_buf, sizeof (mzap_ent_t), 512);
zap_name_t *zn = zap_name_alloc(zap);
for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
diff --git a/tests/zfs-tests/cmd/btree_test.c b/tests/zfs-tests/cmd/btree_test.c
index 9a34bf559..fda922991 100644
--- a/tests/zfs-tests/cmd/btree_test.c
+++ b/tests/zfs-tests/cmd/btree_test.c
@@ -501,7 +501,7 @@ main(int argc, char *argv[])
srandom(seed);
zfs_btree_init();
- zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
+ zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
/*
* This runs the named negative test. None of them should