aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorRob Norris <[email protected]>2023-11-18 21:32:16 +1100
committerBrian Behlendorf <[email protected]>2023-11-28 12:56:43 -0800
commitd702f86eaf5aba13762f436152026ba0befa1a23 (patch)
tree90f161bec3c80b7a9dfe825e2df31ba995aa0e80 /include
parent41c4599cba75c5bb18018fdec022a907e5f14ffa (diff)
brt: lift internal definitions into _impl header
So that zdb (and others!) can get at the BRT on-disk structures. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Kay Pedersen <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes #15541
Diffstat (limited to 'include')
-rw-r--r--include/Makefile.am1
-rw-r--r--include/sys/brt_impl.h199
2 files changed, 200 insertions, 0 deletions
diff --git a/include/Makefile.am b/include/Makefile.am
index 569de6dfa..5f38f6ac6 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -33,6 +33,7 @@ COMMON_H = \
sys/bqueue.h \
sys/btree.h \
sys/brt.h \
+ sys/brt_impl.h \
sys/dataset_kstats.h \
sys/dbuf.h \
sys/ddt.h \
diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h
new file mode 100644
index 000000000..9cc06fbb2
--- /dev/null
+++ b/include/sys/brt_impl.h
@@ -0,0 +1,199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#ifndef _SYS_BRT_IMPL_H
+#define _SYS_BRT_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * BRT - Block Reference Table: internal definitions, shared with e.g. zdb.
+ */
+#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
+ * (1TB / 16MB) * 2B. Each element in this array records how many BRT entries
+ * we have in its chunk of storage. We always load this entire array into
+ * memory and update it as needed. By having it in memory we can quickly tell
+ * (during zio_free()) whether any BRT entries might need to be updated.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With a 512 byte block size we can have exactly 32768
+ * blocks in 16MB. In 32MB we could have 65536 blocks, which is one too many
+ * for a 16bit counter; the assertion below checks the count fits UINT16_MAX.
+ */
+#define BRT_RANGESIZE (16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+ "BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain a bitmap
+ * of dirty blocks within the regions, so that a single bit represents one
+ * disk block's worth of entcounts. For example, if we have a 1PB vdev then
+ * all entcounts take 128MB of memory ((1PB / 16MB) * 2B). We divide this
+ * 128MB array into 32kB disk blocks so that updating a single entcount does
+ * not rewrite the whole 128MB on disk, and represent each block by a single
+ * bit: 4096 bits. A set bit means a change in at least one of the 16384
+ * entcounts on that 32kB disk block (32kB / sizeof (uint16_t)).
+ * NOTE(review): BRT_RANGESIZE_TO_NBLOCKS(2^26) evaluates to 1024, not 4096;
+ * confirm whether the divisor should be (BRT_BLOCKSIZE / sizeof (uint16_t)).
+ */
+#define BRT_BLOCKSIZE (32 * 1024)
+#define BRT_RANGESIZE_TO_NBLOCKS(size) \
+ (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define BRT_LITTLE_ENDIAN 0 /* byte-order tags; presumably for bvp_byteorder */
+#define BRT_BIG_ENDIAN 1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
+#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
+#else /* !_ZFS_LITTLE_ENDIAN */
+#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
+#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
+#endif /* _ZFS_LITTLE_ENDIAN */
+
+typedef struct brt_vdev_phys { /* kept on disk; see bv_meta_dirty below */
+ uint64_t bvp_mos_entries; /* NOTE(review): bvp_* presumably mirror bv_* */
+ uint64_t bvp_size;
+ uint64_t bvp_byteorder; /* presumably a BRT_*_ENDIAN value -- TODO confirm */
+ uint64_t bvp_totalcount;
+ uint64_t bvp_rangesize;
+ uint64_t bvp_usedspace;
+ uint64_t bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+ /*
+ * VDEV id.
+ */
+ uint64_t bv_vdevid;
+ /*
+ * Is the structure initiated?
+ * (Are bv_entcount and bv_bitmap allocated?)
+ */
+ boolean_t bv_initiated;
+ /*
+ * Object number in the MOS for the entcount array and brt_vdev_phys.
+ */
+ uint64_t bv_mos_brtvdev;
+ /*
+ * Object number in the MOS for the entries table.
+ */
+ uint64_t bv_mos_entries;
+ /*
+ * Entries to sync.
+ */
+ avl_tree_t bv_tree;
+ /*
+ * Does the bv_entcount[] array need byte swapping?
+ */
+ boolean_t bv_need_byteswap;
+ /*
+ * Number of entries in the bv_entcount[] array.
+ */
+ uint64_t bv_size;
+ /*
+ * This is the array with the BRT entry count per BRT_RANGESIZE.
+ */
+ uint16_t *bv_entcount;
+ /*
+ * Sum of all bv_entcount[]s.
+ */
+ uint64_t bv_totalcount;
+ /*
+ * Space on disk occupied by cloned blocks (without compression).
+ */
+ uint64_t bv_usedspace;
+ /*
+ * How much additional space would be occupied without block cloning.
+ */
+ uint64_t bv_savedspace;
+ /*
+ * brt_vdev_phys needs updating on disk.
+ */
+ boolean_t bv_meta_dirty;
+ /*
+ * bv_entcount[] needs updating on disk.
+ */
+ boolean_t bv_entcount_dirty;
+ /*
+ * bv_entcount[] can potentially be too big to synchronize in full
+ * when we have changed just a few entcounts. The fields below allow
+ * us to track updates to the bv_entcount[] array since the last sync.
+ * A single bit in the bv_bitmap represents as many entcounts as can
+ * fit into a single BRT_BLOCKSIZE.
+ * For example, say we have 65536 entcounts in the bv_entcount array
+ * (so the whole array is 128kB) and we updated bv_entcount[2] and
+ * bv_entcount[5]. In that case only the first bit in the bv_bitmap is
+ * set and we will write only the first BRT_BLOCKSIZE out of 128kB.
+ */
+ ulong_t *bv_bitmap; /* one dirty bit per BRT_BLOCKSIZE of entcounts */
+ uint64_t bv_nblocks; /* presumably the bit count of bv_bitmap -- confirm */
+} brt_vdev_t;
+
+/*
+ * In-core brt. Note that brt_mos below is a macro alias, not a member.
+ */
+typedef struct brt {
+ krwlock_t brt_lock;
+ spa_t *brt_spa;
+#define brt_mos brt_spa->spa_meta_objset
+ uint64_t brt_rangesize;
+ uint64_t brt_usedspace;
+ uint64_t brt_savedspace;
+ avl_tree_t brt_pending_tree[TXG_SIZE];
+ kmutex_t brt_pending_lock[TXG_SIZE];
+ /* Sum of all entries across all bv_trees. */
+ uint64_t brt_nentries;
+ brt_vdev_t *brt_vdevs;
+ uint64_t brt_nvdevs; /* presumably the length of brt_vdevs -- TODO confirm */
+} brt_t;
+
+/* Key length in 64-bit words: sizeof (bre_offset) / sizeof (uint64_t). */
+#define BRT_KEY_WORDS (1)
+
+/*
+ * In-core brt entry.
+ * On disk, bre_offset is used as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+ uint64_t bre_offset;
+ uint64_t bre_refcount;
+ avl_node_t bre_node; /* presumably links into a bv_tree -- TODO confirm */
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+ blkptr_t bpe_bp;
+ int bpe_count;
+ avl_node_t bpe_node; /* presumably in a brt_pending_tree[] -- TODO confirm */
+} brt_pending_entry_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BRT_IMPL_H */