author:    Richard Yao <[email protected]>  2023-05-26 13:03:12 -0400
committer: GitHub <[email protected]>  2023-05-26 10:03:12 -0700
commit:    677c6f8457943fe5b56d7aa8807010a104563e4a (patch)
tree:      11c2e70b1530c04bf701e7b508cda3545d30f582 /module/zfs
parent:    bb736d98d133b4449a4e3bb97a914651677e6713 (diff)
btree: Implement faster binary search algorithm
This implements a binary search algorithm for B-Trees that reduces branching to the absolute minimum necessary for a binary search algorithm. It also enables the compiler to inline the comparator to ensure that the only slowdown when doing binary search is from waiting for memory accesses. Additionally, it instructs the compiler to unroll the loop, which gives an additional 40% improvement with Clang and an 8% improvement with GCC.

Consumers must opt into using the faster algorithm. At present, only B-Trees used inside kernel code have been modified to use the faster algorithm.

Micro-benchmarks suggest that this can improve binary search performance by up to 3.5 times when compiling with Clang 16 and up to 1.9 times when compiling with GCC 12.2.

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Richard Yao <[email protected]>
Closes #14866
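For readers unfamiliar with the technique, here is a minimal, self-contained sketch of the branch-reduced search the message describes, over a sorted uint64_t buffer. It is illustrative only, not the OpenZFS implementation; the names cmp_u64 and find_in_buf_u64 are made up for the sketch. Each loop iteration performs exactly one comparison and advances the base pointer arithmetically instead of taking a data-dependent branch, and the comparator is a visible static inline function so the compiler can fold it into the loop body.

#include <stddef.h>
#include <stdint.h>

/* Illustrative comparator; returns -1, 0, or 1. */
static inline int
cmp_u64(const uint64_t *a, const uint64_t *b)
{
        return ((*a > *b) - (*a < *b));
}

/*
 * Branch-reduced binary search over a sorted buffer.  The loop body has
 * no data-dependent branch: the comparison result is turned into 0 or 1
 * and used to advance the base pointer, which also lets the compiler
 * unroll the loop cleanly.
 */
static const uint64_t *
find_in_buf_u64(const uint64_t *buf, uint32_t nelems, uint64_t key)
{
        const uint64_t *base = buf;

        if (nelems == 0)
                return (NULL);

        while (nelems > 1) {
                uint32_t half = nelems / 2;
                nelems -= half;
                base += (cmp_u64(&base[half - 1], &key) < 0) * half;
        }

        return (cmp_u64(base, &key) == 0 ? base : NULL);
}

Because the comparator is inlined rather than reached through a function pointer, the only remaining cost in the loop is the memory access itself, which is why the comparators later in this diff gain __attribute__((always_inline)) annotations.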
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/btree.c       22
-rw-r--r--  module/zfs/dsl_scan.c     7
-rw-r--r--  module/zfs/metaslab.c    23
-rw-r--r--  module/zfs/range_tree.c  18
-rw-r--r--  module/zfs/zap_micro.c    6
5 files changed, 59 insertions(+), 17 deletions(-)
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
index 4c25afaa8..af2b94a85 100644
--- a/module/zfs/btree.c
+++ b/module/zfs/btree.c
@@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
- size_t size)
+ bt_find_in_buf_f bt_find_in_buf, size_t size)
{
- zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
+ zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
+ BTREE_LEAF_SIZE);
}
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
+ const void *value, zfs_btree_index_t *where);
+
void
zfs_btree_create_custom(zfs_btree_t *tree,
int (*compar) (const void *, const void *),
+ bt_find_in_buf_f bt_find_in_buf,
size_t size, size_t lsize)
{
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
@@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree,
ASSERT3U(size, <=, esize / 2);
memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar;
+ tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
+ zfs_btree_find_in_buf : bt_find_in_buf;
tree->bt_elem_size = size;
tree->bt_leaf_size = lsize;
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
* element in the last leaf, it's in the last leaf or
* it's not in the tree.
*/
- void *d = zfs_btree_find_in_buf(tree,
+ void *d = tree->bt_find_in_buf(tree,
last_leaf->btl_elems +
last_leaf->btl_hdr.bth_first * size,
last_leaf->btl_hdr.bth_count, value, &idx);
@@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
ASSERT3P(node, !=, NULL);
- void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ void *d = tree->bt_find_in_buf(tree, node->btc_elems,
node->btc_hdr.bth_count, value, &idx);
EQUIV(d != NULL, !idx.bti_before);
if (d != NULL) {
@@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
*/
zfs_btree_leaf_t *leaf = (depth == 0 ?
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
- void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
+ void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
leaf->btl_hdr.bth_first * size,
leaf->btl_hdr.bth_count, value, &idx);
@@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
zfs_btree_index_t idx;
ASSERT(zfs_btree_is_core(par_hdr));
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
par_hdr->bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
uint32_t offset = idx.bti_offset;
@@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
}
zfs_btree_index_t idx;
zfs_btree_core_t *parent = hdr->bth_parent;
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
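The ZFS_BTREE_FIND_IN_BUF_FUNC() macro that the consumers below instantiate is defined in the btree header rather than in module/zfs, so it does not appear in this diff. As a rough sketch under that assumption, it presumably stamps out a per-type search function with the same signature as the generic zfs_btree_find_in_buf() declared above, but with the element type and comparator baked in as compile-time constants so the comparator can be fully inlined instead of being reached through tree->bt_compar:

/*
 * Assumed sketch only -- the real macro lives in the btree header and is
 * not part of this diff.  NAME gets the same signature as the generic
 * zfs_btree_find_in_buf(), T is the element type stored in the tree, and
 * COMP is the always_inline comparator.
 */
#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \
static void * \
NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \
    const void *value, zfs_btree_index_t *where) \
{ \
        T *i = (T *)buf; \
        (void) tree; \
        /* Callers are assumed to never pass an empty buffer. */ \
        while (nelems > 1) { \
                uint32_t half = nelems / 2; \
                nelems -= half; \
                i += (COMP(&i[half - 1], value) < 0) * half; \
        } \
        int comp = COMP(i, value); \
        where->bti_offset = (uint32_t)(i - (T *)buf) + (comp < 0); \
        where->bti_before = (comp != 0); \
        return (comp == 0 ? i : NULL); \
}

The commit message also says the loop is explicitly unrolled; the real macro presumably does that with a compiler unroll pragma, which this sketch omits. Each consumer below (dsl_scan.c, metaslab.c, range_tree.c, zap_micro.c) instantiates the macro with its own element type and always_inline comparator, then passes the generated function to zfs_btree_create() or zfs_btree_create_custom().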
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 6cad33910..9ee719a5e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
* with single operation. Plus it makes scrubs more sequential and reduces
* chances that minor extent change move it within the B-tree.
*/
+__attribute__((always_inline)) inline
static int
ext_size_compare(const void *x, const void *y)
{
@@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y)
return (TREE_CMP(*a, *b));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
+ ext_size_compare)
+
static void
ext_size_create(range_tree_t *rt, void *arg)
{
(void) rt;
zfs_btree_t *size_tree = arg;
- zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
+ zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
+ sizeof (uint64_t));
}
static void
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 24d52a749..94b131fcd 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* Comparison function for the private size-ordered tree using 32-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
@@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
/*
* Comparison function for the private size-ordered tree using 64-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
@@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
+
typedef struct metaslab_rt_arg {
zfs_btree_t *mra_bt;
uint32_t mra_floor_shift;
@@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
+ range_seg32_t, metaslab_rangesize32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
+ range_seg64_t, metaslab_rangesize64_compare)
+
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (rt->rt_type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = metaslab_rangesize32_compare;
+ bt_find = metaslab_rt_find_rangesize32_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = metaslab_rangesize64_compare;
+ bt_find = metaslab_rt_find_rangesize64_in_buf;
break;
default:
panic("Invalid range seg type %d", rt->rt_type);
}
- zfs_btree_create(size_tree, compare, size);
+ zfs_btree_create(size_tree, compare, bt_find, size);
mrap->mra_floor_shift = metaslab_by_size_min_shift;
}
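A side note on the comparator rewrite above: the early-return tie-break is replaced with an arithmetic form so that, once inlined into the generated search loop, the comparison compiles to straight-line code. The snippet below only illustrates the equivalence; the helper name is made up, and TREE_CMP() is shown with its -1/0/1 definition to keep the sketch self-contained.

#define TREE_CMP(a, b)  (((a) > (b)) - ((a) < (b)))     /* -1, 0, or 1 */

/*
 * Equivalent to:
 *
 *      if (cmp != 0)
 *              return (cmp);
 *      return (TREE_CMP(start1, start2));
 *
 * When cmp != 0, !cmp is 0 and the second term vanishes; when cmp == 0,
 * !cmp is 1 and the start comparison breaks the tie.  Both comparisons
 * are always evaluated, which is the price of removing the conditional
 * branch.
 */
static inline int
compare_size_then_start(uint64_t size1, uint64_t start1,
    uint64_t size2, uint64_t start2)
{
        int cmp = TREE_CMP(size1, size2);

        return (cmp + !cmp * TREE_CMP(start1, start2));
}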
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 894c30fca..5174e2c46 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
rt->rt_histogram[idx]--;
}
+__attribute__((always_inline)) inline
static int
range_tree_seg32_compare(const void *x1, const void *x2)
{
@@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg64_compare(const void *x1, const void *x2)
{
@@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg_gap_compare(const void *x1, const void *x2)
{
@@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
+ range_tree_seg32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
+ range_tree_seg64_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
+ range_tree_seg_gap_compare)
+
range_tree_t *
range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = range_tree_seg32_compare;
+ bt_find = range_tree_seg32_find_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = range_tree_seg64_compare;
+ bt_find = range_tree_seg64_find_in_buf;
break;
case RANGE_SEG_GAP:
size = sizeof (range_seg_gap_t);
compare = range_tree_seg_gap_compare;
+ bt_find = range_tree_seg_gap_find_in_buf;
break;
default:
panic("Invalid range seg type %d", type);
}
- zfs_btree_create(&rt->rt_root, compare, size);
+ zfs_btree_create(&rt->rt_root, compare, bt_find, size);
rt->rt_ops = ops;
rt->rt_gap = gap;
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index d6ad8b2b8..085d9cd8b 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size)
}
}
+__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
@@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2)
(uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
+ mze_compare)
+
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
@@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
* 62 entries before we have to add 2KB B-tree core node.
*/
zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
- sizeof (mzap_ent_t), 512);
+ mze_find_in_buf, sizeof (mzap_ent_t), 512);
zap_name_t *zn = zap_name_alloc(zap);
for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
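Similarly, the mzap comparator made always_inline above orders entries by a single packed 64-bit key: the entry hash in the upper 32 bits and the collision differentiator (cd) in the low bits, so one -1/0/1 comparison covers both the primary order and the tie-break. A rough illustration, with field widths assumed from the (uint64_t)(mze->mze_hash) << 32 | mze->mze_cd expression visible in the hunk:

/* Field widths are assumed from the cast and shift in the hunk above. */
static inline int
mze_compare_sketch(uint32_t hash1, uint16_t cd1, uint32_t hash2, uint16_t cd2)
{
        uint64_t k1 = ((uint64_t)hash1 << 32) | cd1;
        uint64_t k2 = ((uint64_t)hash2 << 32) | cd2;

        return ((k1 > k2) - (k1 < k2));
}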