Diffstat (limited to 'include/sys')
-rw-r--r--  include/sys/Makefile.am      |   2
-rw-r--r--  include/sys/avl.h            |   6
-rw-r--r--  include/sys/bitops.h         |  90
-rw-r--r--  include/sys/btree.h          | 238
-rw-r--r--  include/sys/metaslab.h       |   4
-rw-r--r--  include/sys/metaslab_impl.h  |   4
-rw-r--r--  include/sys/range_tree.h     | 242
-rw-r--r--  include/sys/spa.h            |  46
-rw-r--r--  include/sys/space_reftree.h  |   2
-rw-r--r--  include/sys/vdev.h           |   4
-rw-r--r--  include/sys/vdev_impl.h      |   8
11 files changed, 564 insertions(+), 82 deletions(-)
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 734d5c161..5785bfd55 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -7,10 +7,12 @@ COMMON_H = \
$(top_srcdir)/include/sys/arc_impl.h \
$(top_srcdir)/include/sys/avl.h \
$(top_srcdir)/include/sys/avl_impl.h \
+ $(top_srcdir)/include/sys/bitops.h \
$(top_srcdir)/include/sys/blkptr.h \
$(top_srcdir)/include/sys/bplist.h \
$(top_srcdir)/include/sys/bpobj.h \
$(top_srcdir)/include/sys/bptree.h \
+ $(top_srcdir)/include/sys/btree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/cityhash.h \
$(top_srcdir)/include/sys/dataset_kstats.h \
diff --git a/include/sys/avl.h b/include/sys/avl.h
index 962e8b1cf..6c4e7ed5c 100644
--- a/include/sys/avl.h
+++ b/include/sys/avl.h
@@ -108,9 +108,9 @@ extern "C" {
/*
* AVL comparator helpers
*/
-#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
-#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
-#define AVL_PCMP(a, b) \
+#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define TREE_PCMP(a, b) \
(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
/*
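The renamed helpers are drop-in replacements for the AVL_* versions: each expands to exactly -1, 0, or +1 without branching, which is the return-value contract both the AVL tree and the new B-Tree expect from comparators. As a minimal sketch of their use (the my_seg_t type and its fields are hypothetical, for illustration only):

    /* Hypothetical element type. */
    typedef struct my_seg {
            uint64_t ms_start;
            uint64_t ms_size;
    } my_seg_t;

    /* Order segments by start offset, breaking ties by size. */
    static int
    my_seg_compare(const void *va, const void *vb)
    {
            const my_seg_t *a = va;
            const my_seg_t *b = vb;
            int cmp = TREE_CMP(a->ms_start, b->ms_start);

            if (cmp != 0)
                    return (cmp);
            return (TREE_CMP(a->ms_size, b->ms_size));
    }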
diff --git a/include/sys/bitops.h b/include/sys/bitops.h
new file mode 100644
index 000000000..56d52073b
--- /dev/null
+++ b/include/sys/bitops.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#ifndef _SYS_BITOPS_H
+#define _SYS_BITOPS_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1U << (len)); \
+ ASSERT3U(low + len, <=, 32); \
+ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF64_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1ULL << (len)); \
+ ASSERT3U(low + len, <=, 64); \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+/*
+ * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
+ * the case where val is a constant. We can't fix ASSERT because it's used as
+ * an expression in several places in the kernel; as a result, changing it to
+ * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
+ */
+#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITOPS_H */
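These macros are moved verbatim from spa.h (see the removal below). BF64_SET works by XOR: it re-encodes the old field value XORed with the new one, which clears the old bits and sets the new ones in a single expression. The _SB ("shift/bias") variants store values that are known to be aligned and offset, e.g. sizes counted in 512-byte sectors. A sketch of a round-trip, with an arbitrarily chosen field layout:

    static void
    bitops_example(void)
    {
            uint64_t word = 0;

            /* Store the value 5 in the 6-bit field at bits [10, 16). */
            BF64_SET(word, 10, 6, 5);
            ASSERT3U(BF64_GET(word, 10, 6), ==, 5);

            /*
             * Store a byte count that is always a multiple of 512: with
             * shift 9 and bias 0, only the sector count (16 here) lands
             * in the 16-bit field at bits [16, 32).
             */
            BF64_SET_SB(word, 16, 16, 9, 0, 8192);
            ASSERT3U(BF64_GET_SB(word, 16, 16, 9, 0), ==, 8192);
    }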
diff --git a/include/sys/btree.h b/include/sys/btree.h
new file mode 100644
index 000000000..1673b5217
--- /dev/null
+++ b/include/sys/btree.h
@@ -0,0 +1,238 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _BTREE_H
+#define _BTREE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+/*
+ * This file defines the interface for a B-Tree implementation for ZFS. The
+ * tree can be used to store arbitrary sortable data types with low overhead
+ * and good operation performance. In addition, the tree intelligently
+ * optimizes bulk in-order insertions to improve memory use and performance.
+ *
+ * Note that for all B-Tree functions, the values returned are pointers to the
+ * internal copies of the data in the tree. The internal data can only be
+ * safely mutated if the changes cannot change the ordering of the element
+ * with respect to any other elements in the tree.
+ *
+ * The major drawback of the B-Tree is that any returned elements or indexes
+ * are only valid until a side-effectful operation occurs, since these can
+ * result in reallocation or relocation of data. Side effectful operations are
+ * defined as insertion, removal, and zfs_btree_destroy_nodes.
+ *
+ * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
+ * nodes have an array of children pointing to other nodes, and an array of
+ * elements that act as separators between the elements of the subtrees rooted
+ * at its children. Leaf nodes only contain data elements, and form the bottom
+ * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
+ * elements in the core nodes are not copies of or references to leaf node
+ * elements. Each element occurs only once in the tree, no matter what kind
+ * of node it is in.
+ *
+ * The tree has uniform height: every leaf is at the same depth, unlike in
+ * many other forms of search tree. Each node (except for the root) must stay
+ * between half full minus one and completely full of elements (and
+ * children) at all times. Any operation that
+ * would put the node outside of that range results in a rebalancing operation
+ * (taking, merging, or splitting).
+ *
+ * This tree was implemented using descriptions from Wikipedia's articles on
+ * B-Trees and B+ Trees.
+ */
+
+/*
+ * Decreasing these values results in smaller memmove operations, but more of
+ * them, and increased memory overhead. Increasing these values results in
+ * higher variance in operation time, and reduces memory overhead.
+ */
+#define BTREE_CORE_ELEMS 128
+#define BTREE_LEAF_SIZE 4096
+
+extern kmem_cache_t *zfs_btree_leaf_cache;
+
+typedef struct zfs_btree_hdr {
+ struct zfs_btree_core *bth_parent;
+ boolean_t bth_core;
+ /*
+ * For both leaf and core nodes, represents the number of elements in
+ * the node. For core nodes, they will have bth_count + 1 children.
+ */
+ uint32_t bth_count;
+} zfs_btree_hdr_t;
+
+typedef struct zfs_btree_core {
+ zfs_btree_hdr_t btc_hdr;
+ zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1];
+ uint8_t btc_elems[];
+} zfs_btree_core_t;
+
+typedef struct zfs_btree_leaf {
+ zfs_btree_hdr_t btl_hdr;
+ uint8_t btl_elems[];
+} zfs_btree_leaf_t;
+
+typedef struct zfs_btree_index {
+ zfs_btree_hdr_t *bti_node;
+ uint64_t bti_offset;
+ /*
+ * True if the location is before the listed offset, false if it's at
+ * the listed offset.
+ */
+ boolean_t bti_before;
+} zfs_btree_index_t;
+
+typedef struct btree {
+ zfs_btree_hdr_t *bt_root;
+ int64_t bt_height;
+ size_t bt_elem_size;
+ uint64_t bt_num_elems;
+ uint64_t bt_num_nodes;
+ zfs_btree_leaf_t *bt_bulk; /* non-null if bulk loading */
+ int (*bt_compar) (const void *, const void *);
+} zfs_btree_t;
+
+/*
+ * Allocate and deallocate caches for btree nodes.
+ */
+void zfs_btree_init(void);
+void zfs_btree_fini(void);
+
+/*
+ * Initialize a B-Tree. Arguments are:
+ *
+ * tree - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0,
+ * or +1: -1 for <, 0 for ==, and +1 for >
+ * size - the value of sizeof(struct my_type)
+ */
+void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
+ size_t);
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * if one is found. Otherwise returns NULL and, if "where" is not NULL, sets
+ * "where" for use with zfs_btree_insert() or zfs_btree_nearest().
+ *
+ * node - node that has the value being looked for
+ * where - position for use with zfs_btree_nearest() or zfs_btree_insert(),
+ * may be NULL
+ */
+void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node - the node to insert
+ * where - position as returned from zfs_btree_find()
+ */
+void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *);
+
+/*
+ * Return the first (lowest) or last (highest) valued node in the tree.
+ * Returns NULL if the tree is empty.
+ */
+void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
+void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the next or previous valued node in the tree.
+ */
+void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
+ zfs_btree_index_t *);
+void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
+ zfs_btree_index_t *);
+
+/*
+ * Get a value from a tree and an index.
+ */
+void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Add a single value to the tree. The value must not compare equal to any
+ * other node already in the tree.
+ */
+void zfs_btree_add(zfs_btree_t *, const void *);
+
+/*
+ * Remove a single value from the tree. The value must be in the tree. The
+ * pointer passed in may be a pointer into a tree-controlled buffer, but it
+ * need not be.
+ */
+void zfs_btree_remove(zfs_btree_t *, const void *);
+
+/*
+ * Remove the value at the given location from the tree.
+ */
+void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the number of nodes in the tree.
+ */
+ulong_t zfs_btree_numnodes(zfs_btree_t *);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
+ * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
+ *
+ * cookie - an index used to save state between calls to
+ * zfs_btree_destroy_nodes()
+ *
+ * EXAMPLE:
+ * zfs_btree_t *tree;
+ * struct my_data *node;
+ * zfs_btree_index_t *cookie;
+ *
+ * cookie = NULL;
+ * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ * data_destroy(node);
+ * zfs_btree_destroy(tree);
+ */
+void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
+
+/*
+ * Destroys all nodes in the tree quickly. This doesn't give the caller an
+ * opportunity to iterate over each node and do its own cleanup; for that, use
+ * zfs_btree_destroy_nodes().
+ */
+void zfs_btree_clear(zfs_btree_t *);
+
+/*
+ * Final destroy of a B-Tree. Arguments are:
+ *
+ * tree - the empty tree to destroy
+ */
+void zfs_btree_destroy(zfs_btree_t *tree);
+
+/* Runs a variety of self-checks on the btree to verify integrity. */
+void zfs_btree_verify(zfs_btree_t *tree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BTREE_H */
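The interface follows the familiar avl.h find-then-insert pattern, with the caveat noted above that returned pointers are invalidated by any mutating operation. A sketch of typical usage, reusing the hypothetical my_seg_t type and comparator from the avl.h note earlier:

    static void
    btree_example(void)
    {
            zfs_btree_t tree;
            zfs_btree_index_t where;
            my_seg_t want = { .ms_start = 0x1000, .ms_size = 0x200 };
            my_seg_t *s;
            uint64_t count = 0;

            zfs_btree_create(&tree, my_seg_compare, sizeof (my_seg_t));

            /* Insert a copy of the element where the failed find points. */
            if (zfs_btree_find(&tree, &want, &where) == NULL)
                    zfs_btree_insert(&tree, &want, &where);

            /* In-order walk; pointers are valid until the next mutation. */
            for (s = zfs_btree_first(&tree, &where); s != NULL;
                s = zfs_btree_next(&tree, &where, &where))
                    count++;
            ASSERT3U(count, ==, 1);

            zfs_btree_clear(&tree);
            zfs_btree_destroy(&tree);
    }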
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 00b8b4758..f8d9c6a82 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -95,8 +95,8 @@ void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
-void metaslab_alloc_trace_init(void);
-void metaslab_alloc_trace_fini(void);
+void metaslab_stat_init(void);
+void metaslab_stat_fini(void);
void metaslab_trace_init(zio_alloc_list_t *);
void metaslab_trace_fini(zio_alloc_list_t *);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 3ce39183e..d140f741d 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -509,8 +509,8 @@ struct metaslab {
* only difference is that the ms_allocatable_by_size is ordered by
* segment sizes.
*/
- avl_tree_t ms_allocatable_by_size;
- avl_tree_t ms_unflushed_frees_by_size;
+ zfs_btree_t ms_allocatable_by_size;
+ zfs_btree_t ms_unflushed_frees_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index e299613b8..d80faa933 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -30,7 +30,7 @@
#ifndef _SYS_RANGE_TREE_H
#define _SYS_RANGE_TREE_H
-#include <sys/avl.h>
+#include <sys/btree.h>
#include <sys/dmu.h>
#ifdef __cplusplus
@@ -41,20 +41,35 @@ extern "C" {
typedef struct range_tree_ops range_tree_ops_t;
+typedef enum range_seg_type {
+ RANGE_SEG32,
+ RANGE_SEG64,
+ RANGE_SEG_GAP,
+ RANGE_SEG_NUM_TYPES,
+} range_seg_type_t;
+
/*
* Note: the range_tree may not be accessed concurrently; consumers
* must provide external locking if required.
*/
typedef struct range_tree {
- avl_tree_t rt_root; /* offset-ordered segment AVL tree */
+ zfs_btree_t rt_root; /* offset-ordered segment b-tree */
uint64_t rt_space; /* sum of all segments in the map */
- uint64_t rt_gap; /* allowable inter-segment gap */
+ range_seg_type_t rt_type; /* type of range_seg_t in use */
+ /*
+ * All data stored in the range tree must have a start greater than or
+ * equal to rt_start, and all sizes and offsets must be multiples of
+ * 1 << rt_shift.
+ */
+ uint8_t rt_shift;
+ uint64_t rt_start;
range_tree_ops_t *rt_ops;
- /* rt_avl_compare should only be set if rt_arg is an AVL tree */
+ /* rt_btree_compare should only be set if rt_arg is a b-tree */
void *rt_arg;
- int (*rt_avl_compare)(const void *, const void *);
+ int (*rt_btree_compare)(const void *, const void *);
+ uint64_t rt_gap; /* allowable inter-segment gap */
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -64,36 +79,217 @@ typedef struct range_tree {
uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
} range_tree_t;
-typedef struct range_seg {
- avl_node_t rs_node; /* AVL node */
- avl_node_t rs_pp_node; /* AVL picker-private node */
+typedef struct range_seg32 {
+ uint32_t rs_start; /* starting offset of this segment */
+ uint32_t rs_end; /* ending offset (non-inclusive) */
+} range_seg32_t;
+
+/*
+ * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may
+ * require 64-bit integers for ranges.
+ */
+typedef struct range_seg64 {
+ uint64_t rs_start; /* starting offset of this segment */
+ uint64_t rs_end; /* ending offset (non-inclusive) */
+} range_seg64_t;
+
+typedef struct range_seg_gap {
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
uint64_t rs_fill; /* actual fill if gap mode is on */
-} range_seg_t;
+} range_seg_gap_t;
+
+/*
+ * This type needs to be the largest of the range segs, since it will be
+ * stack allocated and then cast to the actual type to do tree operations.
+ */
+typedef range_seg_gap_t range_seg_max_t;
+
+/*
+ * This is just for code clarity: it makes explicit that a pointer is to a
+ * range seg of some type; when we need to do the actual math, we figure out
+ * the real type.
+ */
+typedef void range_seg_t;
struct range_tree_ops {
void (*rtop_create)(range_tree_t *rt, void *arg);
void (*rtop_destroy)(range_tree_t *rt, void *arg);
- void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
- void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+ void (*rtop_add)(range_tree_t *rt, void *rs, void *arg);
+ void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg);
void (*rtop_vacate)(range_tree_t *rt, void *arg);
};
+static inline uint64_t
+rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ return (((range_seg32_t *)rs)->rs_start);
+ case RANGE_SEG64:
+ return (((range_seg64_t *)rs)->rs_start);
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_start);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+}
+
+static inline uint64_t
+rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ return (((range_seg32_t *)rs)->rs_end);
+ case RANGE_SEG64:
+ return (((range_seg64_t *)rs)->rs_end);
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_end);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+}
+
+static inline uint64_t
+rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32: {
+ const range_seg32_t *r32 = rs;
+ return (r32->rs_end - r32->rs_start);
+ }
+ case RANGE_SEG64: {
+ const range_seg64_t *r64 = rs;
+ return (r64->rs_end - r64->rs_start);
+ }
+ case RANGE_SEG_GAP:
+ return (((range_seg_gap_t *)rs)->rs_fill);
+ default:
+ VERIFY(0);
+ return (0);
+ }
+}
+
+static inline uint64_t
+rs_get_start(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_end(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_fill(const range_seg_t *rs, const range_tree_t *rt)
+{
+ return (rs_get_fill_raw(rs, rt) << rt->rt_shift);
+}
+
+static inline void
+rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ ASSERT3U(start, <=, UINT32_MAX);
+ ((range_seg32_t *)rs)->rs_start = (uint32_t)start;
+ break;
+ case RANGE_SEG64:
+ ((range_seg64_t *)rs)->rs_start = start;
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_start = start;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ ASSERT3U(end, <=, UINT32_MAX);
+ ((range_seg32_t *)rs)->rs_end = (uint32_t)end;
+ break;
+ case RANGE_SEG64:
+ ((range_seg64_t *)rs)->rs_end = end;
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_end = end;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ /* fall through */
+ case RANGE_SEG64:
+ ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs,
+ rt));
+ break;
+ case RANGE_SEG_GAP:
+ ((range_seg_gap_t *)rs)->rs_fill = fill;
+ break;
+ default:
+ VERIFY(0);
+ }
+}
+
+static inline void
+rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+ ASSERT3U(start, >=, rt->rt_start);
+ ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift));
+ rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+ ASSERT3U(end, >=, rt->rt_start);
+ ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift));
+ rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+ ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift));
+ rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
+}
+
typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
-void range_tree_init(void);
-void range_tree_fini(void);
-range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare) (const void *, const void *), uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops,
+ range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+ int (*zfs_btree_compare) (const void *, const void *), uint64_t gap);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
uint64_t *ostart, uint64_t *osize);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
-range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
@@ -120,12 +316,12 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
range_tree_t *addto);
-void rt_avl_create(range_tree_t *rt, void *arg);
-void rt_avl_destroy(range_tree_t *rt, void *arg);
-void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_vacate(range_tree_t *rt, void *arg);
-extern struct range_tree_ops rt_avl_ops;
+void rt_btree_create(range_tree_t *rt, void *arg);
+void rt_btree_destroy(range_tree_t *rt, void *arg);
+void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_vacate(range_tree_t *rt, void *arg);
+extern range_tree_ops_t rt_btree_ops;
#ifdef __cplusplus
}
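The raw accessors make the space savings concrete: a segment stores its offsets relative to rt_start and scaled down by rt_shift, so a RANGE_SEG32 tree can describe ranges well beyond 4 GiB as long as they are aligned. rs_set_start()/rs_get_start() and friends apply the translation in both directions. A sketch of the arithmetic, assuming a hypothetical RANGE_SEG32 tree created with start = 0x100000 and shift = 9:

    static void
    range_seg_example(range_tree_t *rt)
    {
            /*
             * Assumes rt has rt_start == 0x100000 and rt_shift == 9, so
             * all offsets must be 512-byte aligned and >= 0x100000.
             */
            range_seg_max_t seg;    /* large enough for any seg type */

            rs_set_start(&seg, rt, 0x180000);
            rs_set_end(&seg, rt, 0x182000);

            /* Raw values are stored as ((offset - rt_start) >> rt_shift). */
            ASSERT3U(rs_get_start_raw(&seg, rt), ==,
                (0x180000 - 0x100000) >> 9);
            ASSERT3U(rs_get_start(&seg, rt), ==, 0x180000);
            ASSERT3U(rs_get_fill(&seg, rt), ==, 0x2000);
    }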
diff --git a/include/sys/spa.h b/include/sys/spa.h
index caa2157b9..18933e0d4 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -43,6 +43,7 @@
#include <sys/spa_checksum.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
+#include <sys/bitops.h>
#ifdef __cplusplus
extern "C" {
@@ -70,51 +71,6 @@ struct dsl_dataset;
struct dsl_crypto_params;
/*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
-#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
-#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
-#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
-
-#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
-#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-
-#define BF32_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1U << (len)); \
- ASSERT3U(low + len, <=, 32); \
- (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF64_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1ULL << (len)); \
- ASSERT3U(low + len, <=, 64); \
- ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF32_GET_SB(x, low, len, shift, bias) \
- ((BF32_GET(x, low, len) + (bias)) << (shift))
-#define BF64_GET_SB(x, low, len, shift, bias) \
- ((BF64_GET(x, low, len) + (bias)) << (shift))
-
-/*
- * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
- * the case where val is a constant. We can't fix ASSERT because it's used as
- * an expression in several places in the kernel; as a result, changing it to
- * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
- */
-#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-
-/*
* We currently support block sizes from 512 bytes to 16MB.
* The benefits of larger blocks, and thus larger IO, need to be weighed
* against the cost of COWing a giant block to modify one byte, and the
diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h
index 249b15be6..ca9d41dc1 100644
--- a/include/sys/space_reftree.h
+++ b/include/sys/space_reftree.h
@@ -31,7 +31,7 @@
#define _SYS_SPACE_REFTREE_H
#include <sys/range_tree.h>
-
+#include <sys/avl.h>
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 0d0d0234e..c2fbcc549 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -96,8 +96,8 @@ extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
- range_seg_t *physical_rs);
+extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
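Since range_seg_t is now an opaque type, translation callers must name the concrete 64-bit segment explicitly; vdev_xlate() operates on plain stack-allocated segments, not tree nodes. A minimal sketch (vd, offset, and size are assumed to come from the caller):

    static void
    xlate_example(vdev_t *vd, uint64_t offset, uint64_t size)
    {
            range_seg64_t logical, physical;

            logical.rs_start = offset;
            logical.rs_end = offset + size;

            /* Translate the logical range on vd to its physical range. */
            vdev_xlate(vd, &logical, &physical);
    }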
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c179191e3..ae82b75c0 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -86,8 +86,8 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
* Given a target vdev, translates the logical range "in" to the physical
* range "res"
*/
-typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
- range_seg_t *res);
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in,
+ range_seg64_t *res);
typedef const struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -526,8 +526,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
-extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
- range_seg_t *out);
+extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in,
+ range_seg64_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);