16 files changed, 3028 insertions, 79 deletions
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
new file mode 100644
index 000000000..ca9c4e678
--- /dev/null
+++ b/module/zfs/brt.c
@@ -0,0 +1,1884 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/brt.h>
+#include <sys/ddt.h>
+#include <sys/bitmap.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/kstat.h>
+#include <sys/wmsum.h>
+
+/*
+ * Block Cloning design.
+ *
+ * Block Cloning allows to manually clone a file (or a subset of its blocks)
+ * into another (or the same) file by just creating additional references to
+ * the data blocks without copying the data itself. Those references are kept
+ * in the Block Reference Tables (BRTs).
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use a
+ *   dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ *   just once. Block Cloning creates an entry in its tables only when there
+ *   are at least two references to the given data block. If the block was
+ *   never explicitly cloned or the second to last reference was dropped,
+ *   there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
+ *   data, just block pointers to the data, so it is extremely fast, as we pay
+ *   neither the cost of reading the data, nor the cost of writing the data -
+ *   we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ *   the block is not in the dedup table (DDT) and we won't consult the DDT
+ *   when we need to free the block. Block Cloning must be consulted on every
+ *   free, because we cannot modify the source BP (eg. by setting something
+ *   similar to the D bit), thus we have no hint if the block is in the
+ *   Block Reference Table (BRT), so we need to look into the BRT. There is
+ *   an optimization in place that allows us to eliminate the majority of BRT
+ *   lookups which is described below in the "Minimizing free penalty" section.
+ * - The BRT entry is much smaller than the DDT entry - for BRT we only store
+ *   64bit offset and 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ *   other on disk are most likely in totally different parts of the DDT.
+ *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ *   from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ *   times in the DDT. Unfortunately it is not currently (if at all) possible
+ *   with Block Cloning and block referenced multiple times will be scrubbed
+ *   multiple times. The new, sorted scrub should be able to eliminate
+ *   duplicated reads given enough memory.
+ * - Deduplication requires cryptographically strong hash as a checksum or
+ *   additional data verification. Block Cloning works with any checksum
+ *   algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only offset
+ * and counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in-sync. We can
+ * keep the array in memory, because it is very small. With 16MB regions and
+ * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then we will
+ * do a BRT lookup and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite entire
+ * array on every change. We then divide the array into 32kB blocks and keep
+ * a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can only update the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we will update entire array if there was
+ * any change.
+ *
+ * The implementation tries to be economic: if BRT is not used, or no longer
+ * used, there will be no entries in the MOS and no additional memory used (eg.
+ * the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both DDT and BRT. When we free one of the
+ * references we couldn't tell where it belongs, so we would have to decide
+ * what table takes the precedence: do we first clear DDT references or BRT
+ * references? To avoid this dilemma BRT cooperates with DDT - if a given block
+ * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
+ * lookup DDT entry instead and increase the counter there. No BRT entry
+ * will be created for a block which has the D (dedup) bit set.
+ * BRT may be more efficient for manual deduplication, but if the block is
+ * already in the DDT, then creating additional BRT entry would be less
+ * efficient. This clever idea was proposed by Allan Jude.
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into
+ * dataset we need no additional storage. Without Block Cloning we would need
+ * additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Note: currently it is not possible to clone blocks between encrypted
+ * datasets, even if those datasets use the same encryption key (this includes
+ * snapshots of encrypted datasets). Cloning blocks between datasets that use
+ * the same keys should be possible and should be implemented in the future.
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
+ * it for block cloning.
+ *
+ *	ssize_t
+ *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ *	                size_t len, unsigned int flags);
+ *
+ * Even though offsets and length represent bytes, they have to be
+ * block-aligned or we will return the EXDEV error so the upper layer can
+ * fallback to the generic mechanism that will just copy the data.
+ * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
+ * This function was implemented based on zfs_write(), but instead of writing
+ * the given data we first read block pointers using the new dmu_read_l0_bps()
+ * function from the source file. Once we have BPs from the source file we call
+ * the dmu_brt_clone() function on the destination file. This function
+ * allocates BPs for us. We iterate over all source BPs. If the given BP is
+ * a hole or an embedded block, we just copy BP as-is. If it points to a real
+ * data we place this BP on a BRT pending list using the brt_pending_add()
+ * function.
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ *   transaction group that we are trying to clone. Such block has no BP
+ *   allocated yet, so cannot be immediately cloned. We return EXDEV.
+ * - The block we want to clone may have been modified within the same
+ *   transaction group. We return EXDEV.
+ * - A block may be cloned multiple times during one transaction group (that's
+ *   why pending list is actually a tree and not an append-only list - this
+ *   way we can figure out faster if this block is cloned for the first time
+ *   in this txg or consecutive time).
+ * - A block may be cloned and freed within the same transaction group
+ *   (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ *   can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ *   open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call the brt_entry_decref()
+ * that loads the corresponding BRT entry (if one exists) and decreases
+ * reference counter. If this is not the last reference we will stop ZIO
+ * pipeline here. If this is the last reference or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
+ *
+ * At the beginning of spa_sync() where there can be no more block cloning,
+ * but before issuing frees we call brt_pending_apply(). This function applies
+ * all the new clones to the BRT table - we load BRT entries and update
+ * reference counters. To sync new BRT entries to disk, we use brt_sync()
+ * function. This function will sync all dirty per-top-level-vdev BRTs,
+ * the entry counters arrays, etc.
+ *
+ * Block Cloning and ZIL.
+ *
+ * Every clone operation is divided into chunks (similar to write) and each
+ * chunk is cloned in a separate transaction. The chunk size is determined by
+ * how many BPs we can fit into a single ZIL entry.
+ * Replaying clone operation is different from the regular clone operation,
+ * as when we log clone operations we cannot use the source object - it may
+ * reside on a different dataset, so we log BPs we want to clone.
+ * The ZIL is replayed when we mount the given dataset, not when the pool is
+ * imported. Taking this into account it is possible that the pool is imported
+ * without mounting datasets and the source dataset is destroyed before the
+ * destination dataset is mounted and its ZIL replayed.
+ * To address this situation we leverage zil_claim() mechanism where ZFS will
+ * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
+ * entries, we will bump reference counters for their BPs in the BRT and then
+ * on mount and ZIL replay we will just attach BPs to the file without
+ * bumping reference counters.
+ * Note it is still possible that after zil_claim() we never mount the
+ * destination, so we never replay its ZIL and we destroy it. This way we would
+ * end up with leaked references in BRT. We address that too as ZFS gives us
+ * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
+ */
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries do we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define	BRT_RANGESIZE	(16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+	"BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain bitmap
+ * of dirty blocks within the regions, so that a single bit represents a
+ * block size of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
+ * is represented by a single bit. This gives us 4096 bits. A set bit in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define	BRT_BLOCKSIZE	(32 * 1024)
+#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
+	(((size) - 1) / (BRT_BLOCKSIZE / sizeof (uint16_t)) + 1)
+
+#define	BRT_LITTLE_ENDIAN	0	/* tags compared with bvp_byteorder */
+#define	BRT_BIG_ENDIAN		1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
+#else	/* !_ZFS_LITTLE_ENDIAN */
+#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
+#endif	/* _ZFS_LITTLE_ENDIAN */
+
+typedef struct brt_vdev_phys {	/* bonus buffer of the BRT VDEV object */
+	uint64_t	bvp_mos_entries;	/* loaded into bv_mos_entries */
+	uint64_t	bvp_size;	/* entcounts stored in the object */
+	uint64_t	bvp_byteorder;	/* compared to BRT_NATIVE_BYTEORDER */
+	uint64_t	bvp_totalcount;	/* loaded into bv_totalcount */
+	uint64_t	bvp_rangesize;	/* loaded into brt_rangesize */
+	uint64_t	bvp_usedspace;	/* loaded into bv_usedspace */
+	uint64_t	bvp_savedspace;	/* loaded into bv_savedspace */
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+	/*
+	 * VDEV id (also the index of this slot in brt_vdevs[]).
+	 */
+	uint64_t	bv_vdevid;
+	/*
+	 * Is the structure initiated?
+	 * (bv_entcount and bv_bitmap are allocated?)
+	 */
+	boolean_t	bv_initiated;
+	/*
+	 * Object number in the MOS for the entcount array and brt_vdev_phys.
+	 */
+	uint64_t	bv_mos_brtvdev;
+	/*
+	 * Object number in the MOS for the entries table.
+	 */
+	uint64_t	bv_mos_entries;
+	/*
+	 * Entries to sync (brt_entry_t nodes linked through bre_node).
+	 */
+	avl_tree_t	bv_tree;
+	/*
+	 * Does the bv_entcount[] array need byte swapping?
+	 */
+	boolean_t	bv_need_byteswap;
+	/*
+	 * Number of entries in the bv_entcount[] array.
+	 */
+	uint64_t	bv_size;
+	/*
+	 * Array of BRT entry counts, one per brt_rangesize bytes of the vdev.
+	 */
+	uint16_t	*bv_entcount;
+	/*
+	 * Sum of all bv_entcount[]s.
+	 */
+	uint64_t	bv_totalcount;
+	/*
+	 * Space on disk occupied by cloned blocks (without compression).
+	 */
+	uint64_t	bv_usedspace;
+	/*
+	 * How much additional space would be occupied without block cloning.
+	 */
+	uint64_t	bv_savedspace;
+	/*
+	 * brt_vdev_phys needs updating on disk.
+	 */
+	boolean_t	bv_meta_dirty;
+	/*
+	 * bv_entcount[] needs updating on disk.
+	 */
+	boolean_t	bv_entcount_dirty;
+	/*
+	 * bv_entcount[] potentially can be a bit too big to synchronize it all
+	 * when we just changed a few entcounts. The fields below allow us to
+	 * track updates to the bv_entcount[] array since the last sync.
+	 * A single bit in the bv_bitmap represents as many entcounts as can
+	 * fit into a single BRT_BLOCKSIZE.
+	 * For example we have 65536 entcounts in the bv_entcount array
+	 * (so the whole array is 128kB). We updated bv_entcount[2] and
+	 * bv_entcount[5]. In that case only the first bit in the bv_bitmap
+	 * will be set and we will write only first BRT_BLOCKSIZE out of 128kB.
+	 */
+	ulong_t		*bv_bitmap;
+	uint64_t	bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core BRT: one per pool (brt_spa), holding one brt_vdev_t slot per vdev id.
+ */
+typedef struct brt {
+	krwlock_t	brt_lock;	/* see brt_rlock()/brt_wlock() */
+	spa_t		*brt_spa;
+#define	brt_mos		brt_spa->spa_meta_objset
+	uint64_t	brt_rangesize;	/* bytes covered by one entcount */
+	uint64_t	brt_usedspace;	/* sum of bv_usedspace */
+	uint64_t	brt_savedspace;	/* sum of bv_savedspace */
+	avl_tree_t	brt_pending_tree[TXG_SIZE];
+	kmutex_t	brt_pending_lock[TXG_SIZE];
+	/* Sum of all entries across all bv_trees. */
+	uint64_t	brt_nentries;
+	brt_vdev_t	*brt_vdevs;	/* array of brt_nvdevs slots */
+	uint64_t	brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define	BRT_KEY_WORDS	(1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+	uint64_t	bre_offset;	/* offset within the vdev */
+	uint64_t	bre_refcount;
+	avl_node_t	bre_node;	/* linkage in brt_vdev_t bv_tree */
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+	blkptr_t	bpe_bp;		/* presumably the cloned BP - confirm */
+	int		bpe_count;	/* NOTE(review): clones this txg? */
+	avl_node_t	bpe_node;	/* presumably brt_pending_tree[] link */
+} brt_pending_entry_t;
+
+static kmem_cache_t *brt_entry_cache;
+static kmem_cache_t *brt_pending_entry_cache;
+
+/*
+ * Enable/disable prefetching of BRT entries that we are going to modify.
+ */
+int zfs_brt_prefetch = 1;
+
+#ifdef ZFS_DEBUG
+#define	BRT_DEBUG(...)	do {						\
+	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
+		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
+	}								\
+} while (0)
+#else	/* !ZFS_DEBUG: BRT_DEBUG() expands to an empty statement */
+#define	BRT_DEBUG(...)	do { } while (0)
+#endif	/* ZFS_DEBUG */
+
+int brt_zap_leaf_blockshift = 12;	/* passed to zap_create_flags() */
+int brt_zap_indirect_blockshift = 12;	/* passed to zap_create_flags() */
+
+static kstat_t	*brt_ksp;
+
+typedef struct brt_stats {	/* field names mirror those in brt_sums */
+	kstat_named_t brt_addref_entry_in_memory;
+	kstat_named_t brt_addref_entry_not_on_disk;
+	kstat_named_t brt_addref_entry_on_disk;
+	kstat_named_t brt_addref_entry_read_lost_race;
+	kstat_named_t brt_decref_entry_in_memory;
+	kstat_named_t brt_decref_entry_loaded_from_disk;
+	kstat_named_t brt_decref_entry_not_in_memory;
+	kstat_named_t brt_decref_entry_not_on_disk;
+	kstat_named_t brt_decref_entry_read_lost_race;
+	kstat_named_t brt_decref_entry_still_referenced;
+	kstat_named_t brt_decref_free_data_later;
+	kstat_named_t brt_decref_free_data_now;
+	kstat_named_t brt_decref_no_entry;
+} brt_stats_t;
+
+static brt_stats_t brt_stats = {	/* name and data type, in field order */
+	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
+	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
+	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
+	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
+	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
+	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
+	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
+	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
+	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
+	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
+	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
+	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
+	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
+};
+
+/* wmsum counters bumped by BRTSTAT_BUMP(); names match brt_stats 1:1. */
+static struct {
+	wmsum_t brt_addref_entry_in_memory;
+	wmsum_t brt_addref_entry_not_on_disk;
+	wmsum_t brt_addref_entry_on_disk;
+	wmsum_t brt_addref_entry_read_lost_race;
+	wmsum_t brt_decref_entry_in_memory;
+	wmsum_t brt_decref_entry_loaded_from_disk;
+	wmsum_t brt_decref_entry_not_in_memory;
+	wmsum_t brt_decref_entry_not_on_disk;
+	wmsum_t brt_decref_entry_read_lost_race;
+	wmsum_t brt_decref_entry_still_referenced;
+	wmsum_t brt_decref_free_data_later;
+	wmsum_t brt_decref_free_data_now;
+	wmsum_t brt_decref_no_entry;
+} brt_sums;
+#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
+
+static int brt_entry_compare(const void *x1, const void *x2);
+static int brt_pending_entry_compare(const void *x1, const void *x2);
+
+static void
+brt_rlock(brt_t *brt)
+{
+	rw_enter(&brt->brt_lock, RW_READER);	/* reader hold on brt_lock */
+}
+
+static void
+brt_wlock(brt_t *brt)
+{
+	rw_enter(&brt->brt_lock, RW_WRITER);	/* writer hold on brt_lock */
+}
+
+static void
+brt_unlock(brt_t *brt)
+{
+	rw_exit(&brt->brt_lock);	/* pairs with brt_rlock()/brt_wlock() */
+}
+
+/* Return the BRT entry count for region idx, in native byte order. */
+static uint16_t
+brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
+{
+	uint16_t stored;
+
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	/* bv_entcount[] holds the values as loaded; swap on the way out. */
+	stored = brtvd->bv_entcount[idx];
+	return (brtvd->bv_need_byteswap ? BSWAP_16(stored) : stored);
+}
+
+/* Store entcnt as the BRT entry count for region idx. */
+static void
+brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
+{
+	uint16_t stored;
+
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	/* Swap first when the array is kept in non-native byte order. */
+	stored = brtvd->bv_need_byteswap ? BSWAP_16(entcnt) : entcnt;
+	brtvd->bv_entcount[idx] = stored;
+}
+
+/* Bump the entry count of region idx by one; it must not be saturated. */
+static void
+brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
+{
+	uint16_t entcnt;
+
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	entcnt = brt_vdev_entcount_get(brtvd, idx);
+	ASSERT(entcnt < UINT16_MAX);
+	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
+}
+
+/* Drop the entry count of region idx by one; it must be non-zero. */
+static void
+brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
+{
+	uint16_t entcnt;
+
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	entcnt = brt_vdev_entcount_get(brtvd, idx);
+	ASSERT(entcnt > 0);
+	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
+}
+
+#ifdef ZFS_DEBUG
+/* Log all BRT vdev slots via zfs_dbgmsg() when ZFS_DEBUG_BRT is enabled. */
+static void
+brt_vdev_dump(brt_t *brt)
+{
+	uint64_t vdevid, idx;
+
+	if ((zfs_flags & ZFS_DEBUG_BRT) == 0)
+		return;
+	if (brt->brt_nvdevs == 0) {
+		zfs_dbgmsg("BRT empty");
+		return;
+	}
+
+	zfs_dbgmsg("BRT vdev dump:");
+	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
+		char *bitmap;
+
+		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
+		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
+		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
+		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
+		    (u_longlong_t)brtvd->bv_size,
+		    (u_longlong_t)brtvd->bv_totalcount,
+		    (u_longlong_t)brtvd->bv_nblocks,
+		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
+		if (brtvd->bv_totalcount > 0) {
+			/* Only the regions that hold entries are listed. */
+			zfs_dbgmsg("    entcounts:");
+			for (idx = 0; idx < brtvd->bv_size; idx++) {
+				uint16_t n = brt_vdev_entcount_get(brtvd, idx);
+
+				if (n == 0)
+					continue;
+				zfs_dbgmsg("      [%04llu] %hu",
+				    (u_longlong_t)idx, n);
+			}
+		}
+		if (!brtvd->bv_entcount_dirty)
+			continue;
+
+		/* One character per bitmap bit: 'x' if set, '.' if clear. */
+		bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
+		for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
+			bitmap[idx] =
+			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
+		}
+		bitmap[idx] = '\0';
+		zfs_dbgmsg("    bitmap: %s", bitmap);
+		kmem_free(bitmap, brtvd->bv_nblocks + 1);
+	}
+}
+#endif
+
+/*
+ * Map a vdev id to its slot in brt_vdevs[].  Ids at or beyond brt_nvdevs
+ * have no slot and yield NULL.  The caller must hold brt_lock.
+ */
+static brt_vdev_t *
+brt_vdev(brt_t *brt, uint64_t vdevid)
+{
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+	if (vdevid >= brt->brt_nvdevs) {
+		return (NULL);
+	}
+
+	return (&brt->brt_vdevs[vdevid]);
+}
+
+static void
+brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+	char name[64];	/* pool directory key: prefix + vdev id */
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT0(brtvd->bv_mos_brtvdev);
+	ASSERT0(brtvd->bv_mos_entries);
+	ASSERT(brtvd->bv_entcount != NULL);
+	ASSERT(brtvd->bv_size > 0);
+	ASSERT(brtvd->bv_bitmap != NULL);
+	ASSERT(brtvd->bv_nblocks > 0);
+
+	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
+	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
+	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
+	    0, tx);
+	VERIFY(brtvd->bv_mos_entries != 0);
+	BRT_DEBUG("MOS entries created, object=%llu",
+	    (u_longlong_t)brtvd->bv_mos_entries);
+
+	/*
+	 * We allocate a DMU object to store the bv_entcount[] array.
+	 * We will keep array size (bv_size) and cumulative count for all
+	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
+	 */
+	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
+	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
+	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
+	VERIFY(brtvd->bv_mos_brtvdev != 0);
+	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
+	    (u_longlong_t)brtvd->bv_mos_brtvdev);
+
+	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+	    (u_longlong_t)brtvd->bv_vdevid);
+	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
+	BRT_DEBUG("Pool directory object created, object=%s", name);
+
+	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+/*
+ * (Re)allocate bv_entcount[] and bv_bitmap, sized from vdev_get_min_asize().
+ * First use also creates bv_tree; later calls carry the old contents over.
+ */
+static void
+brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+	uint64_t nblocks, size;
+	uint16_t *entcount;
+	ulong_t *bitmap;
+	vdev_t *vd;
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+
+	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
+	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
+	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
+	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
+
+	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
+	entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
+	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
+
+	if (brtvd->bv_initiated) {
+		ASSERT(brtvd->bv_size > 0);
+		ASSERT(brtvd->bv_entcount != NULL);
+		ASSERT(brtvd->bv_bitmap != NULL);
+		ASSERT(brtvd->bv_nblocks > 0);
+		/*
+		 * TODO: Allow vdev shrinking. We only need to implement
+		 * shrinking the on-disk BRT VDEV object.
+		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
+		 *     size, tx);
+		 */
+		ASSERT3U(brtvd->bv_size, <=, size);
+		memcpy(entcount, brtvd->bv_entcount,
+		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
+		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
+		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
+		kmem_free(brtvd->bv_entcount,
+		    sizeof (entcount[0]) * brtvd->bv_size);
+		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+	} else {
+		ASSERT0(brtvd->bv_size);
+		ASSERT(brtvd->bv_entcount == NULL);
+		ASSERT(brtvd->bv_bitmap == NULL);
+		ASSERT0(brtvd->bv_nblocks);
+		avl_create(&brtvd->bv_tree, brt_entry_compare,
+		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
+		brtvd->bv_need_byteswap = FALSE;
+		brtvd->bv_initiated = TRUE;
+		BRT_DEBUG("BRT VDEV %llu initiated.",
+		    (u_longlong_t)brtvd->bv_vdevid);
+	}
+
+	brtvd->bv_size = size;
+	brtvd->bv_entcount = entcount;
+	brtvd->bv_bitmap = bitmap;
+	brtvd->bv_nblocks = nblocks;
+}
+
+static void
+brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
+{
+	char name[64];	/* pool directory key: prefix + vdev id */
+	dmu_buf_t *db;
+	brt_vdev_phys_t *bvphys;
+	int error;
+
+	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+	    (u_longlong_t)brtvd->bv_vdevid);
+	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
+	if (error != 0)
+		return;	/* lookup failed; nothing is loaded */
+	ASSERT(brtvd->bv_mos_brtvdev != 0);
+
+	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
+	ASSERT0(error);
+	if (error != 0)
+		return;
+
+	bvphys = db->db_data;	/* bonus buffer holds brt_vdev_phys_t */
+	if (brt->brt_rangesize == 0) {
+		brt->brt_rangesize = bvphys->bvp_rangesize;
+	} else {
+		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
+	}
+
+	ASSERT(!brtvd->bv_initiated);
+	brt_vdev_realloc(brt, brtvd);
+
+	/* TODO: We don't support VDEV shrinking. */
+	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
+
+	/*
+	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
+	 */
+	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
+	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
+	ASSERT0(error);	/* NOTE(review): unchecked when ASSERTs are off */
+
+	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
+	ASSERT(brtvd->bv_mos_entries != 0);
+	brtvd->bv_need_byteswap =
+	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
+	brtvd->bv_totalcount = bvphys->bvp_totalcount;
+	brtvd->bv_usedspace = bvphys->bvp_usedspace;
+	brtvd->bv_savedspace = bvphys->bvp_savedspace;
+	brt->brt_usedspace += brtvd->bv_usedspace;
+	brt->brt_savedspace += brtvd->bv_savedspace;
+
+	dmu_buf_rele(db, FTAG);	/* bvphys must not be used past this point */
+
+	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
+	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
+	    (u_longlong_t)brtvd->bv_mos_entries);
+}
+
+/* Free the in-core arrays and tree of an initiated BRT vdev slot. */
+static void
+brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT(brtvd->bv_initiated);
+
+	kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
+	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+	brtvd->bv_entcount = NULL;
+	brtvd->bv_bitmap = NULL;
+	brtvd->bv_size = 0;
+	brtvd->bv_nblocks = 0;
+
+	/* The tree of entries to sync has to be empty by now. */
+	ASSERT0(avl_numnodes(&brtvd->bv_tree));
+	avl_destroy(&brtvd->bv_tree);
+	brtvd->bv_initiated = FALSE;
+	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
+}
+
+/*
+ * Free a vdev's BRT objects in the MOS (entries ZAP, BRT VDEV object, pool
+ * directory entry) and its in-core state.  The entries ZAP must be empty.
+ */
+static void
+brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+	brt_vdev_phys_t *bvphys;
+	dmu_buf_t *db;
+	uint64_t count;
+	char name[64];
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT(brtvd->bv_mos_brtvdev != 0);
+	ASSERT(brtvd->bv_mos_entries != 0);
+	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+	    (u_longlong_t)brtvd->bv_vdevid);
+
+	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
+	VERIFY0(count);
+	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
+	BRT_DEBUG("MOS entries destroyed, object=%llu",
+	    (u_longlong_t)brtvd->bv_mos_entries);
+	brtvd->bv_mos_entries = 0;
+
+	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+	bvphys = db->db_data;
+	ASSERT0(bvphys->bvp_totalcount);
+	ASSERT0(bvphys->bvp_usedspace);
+	ASSERT0(bvphys->bvp_savedspace);
+	dmu_buf_rele(db, FTAG);
+	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
+	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
+	    (u_longlong_t)brtvd->bv_mos_brtvdev);
+	brtvd->bv_mos_brtvdev = 0;
+
+	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+	BRT_DEBUG("Pool directory object removed, object=%s", name);
+	brt_vdev_dealloc(brt, brtvd);
+	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+/*
+ * Grow brt_vdevs[] to nvdevs slots.  Existing slots are copied over as-is;
+ * each new slot only gets its vdev id and is left uninitiated.
+ */
+static void
+brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
+{
+	const uint64_t oldcnt = brt->brt_nvdevs;
+	const size_t oldsz = sizeof (brt_vdev_t) * oldcnt;
+	brt_vdev_t *vdevs;
+	uint64_t vdevid;
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
+
+	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
+	if (oldcnt > 0) {
+		ASSERT(brt->brt_vdevs != NULL);
+		memcpy(vdevs, brt->brt_vdevs, oldsz);
+		kmem_free(brt->brt_vdevs, oldsz);
+	}
+	for (vdevid = oldcnt; vdevid < nvdevs; vdevid++) {
+		vdevs[vdevid].bv_vdevid = vdevid;
+		vdevs[vdevid].bv_initiated = FALSE;
+	}
+
+	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
+	    (u_longlong_t)oldcnt, (u_longlong_t)nvdevs);
+	brt->brt_vdevs = vdevs;
+	brt->brt_nvdevs = nvdevs;
+}
+
+/*
+ * Check the in-core entry counters to see whether the range containing
+ * bre->bre_offset has any entries. Returns FALSE if the counters array is
+ * not allocated or if the range index lies beyond the array.
+ *
+ * The BRT lock must be held (for reading or writing).
+ */
+static boolean_t
+brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
+{
+	uint64_t range;
+
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+	range = bre->bre_offset / brt->brt_rangesize;
+
+	/* No counters, or the index lies beyond the counters array. */
+	if (brtvd->bv_entcount == NULL || range >= brtvd->bv_size)
+		return (FALSE);
+
+	/* VDEV wasn't expanded. */
+	return (brt_vdev_entcount_get(brtvd, range) > 0);
+}
+
+/*
+ * Update the in-core accounting of a BRT VDEV after a reference was added
+ * to the given entry.
+ *
+ * dsize is always added to the saved-space counters. When the entry's
+ * reference count is not greater than one, dsize is also added to the
+ * used-space counters, the total and per-range entry counters are bumped
+ * and the matching bit in the bitmap is set.
+ */
+static void
+brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+    uint64_t dsize)
+{
+	uint64_t idx, dirtyidx;
+
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+	ASSERT(brtvd != NULL);
+	ASSERT(brtvd->bv_entcount != NULL);
+
+	brtvd->bv_meta_dirty = TRUE;
+	brtvd->bv_savedspace += dsize;
+	brt->brt_savedspace += dsize;
+
+	/* Nothing more to account for unless the refcount is at most one. */
+	if (bre->bre_refcount > 1)
+		return;
+
+	brtvd->bv_usedspace += dsize;
+	brt->brt_usedspace += dsize;
+
+	idx = bre->bre_offset / brt->brt_rangesize;
+	if (idx >= brtvd->bv_size) {
+		/* VDEV has been expanded. */
+		brt_vdev_realloc(brt, brtvd);
+	}
+
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	brtvd->bv_totalcount += 1;
+	brt_vdev_entcount_inc(brtvd, idx);
+	brtvd->bv_entcount_dirty = TRUE;
+
+	dirtyidx = idx / BRT_BLOCKSIZE / 8;
+	BT_SET(brtvd->bv_bitmap, dirtyidx);
+
+#ifdef ZFS_DEBUG
+	brt_vdev_dump(brt);
+#endif
+}
+
+/*
+ * Update the in-core accounting of a BRT VDEV after a reference was dropped
+ * from the given entry.
+ *
+ * dsize is always subtracted from the saved-space counters. When the
+ * entry's reference count has reached zero, dsize is also subtracted from
+ * the used-space counters, the total and per-range entry counters are
+ * decremented and the matching bit in the bitmap is set.
+ *
+ * The BRT lock must be held for writing.
+ */
+static void
+brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+    uint64_t dsize)
+{
+	uint64_t idx, dirtyidx;
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT(brtvd != NULL);
+	ASSERT(brtvd->bv_entcount != NULL);
+
+	brtvd->bv_meta_dirty = TRUE;
+	brtvd->bv_savedspace -= dsize;
+	brt->brt_savedspace -= dsize;
+
+	/* The entry is still referenced; counters stay as they are. */
+	if (bre->bre_refcount > 0)
+		return;
+
+	brtvd->bv_usedspace -= dsize;
+	brt->brt_usedspace -= dsize;
+
+	idx = bre->bre_offset / brt->brt_rangesize;
+	ASSERT3U(idx, <, brtvd->bv_size);
+
+	ASSERT(brtvd->bv_totalcount > 0);
+	brtvd->bv_totalcount -= 1;
+	brt_vdev_entcount_dec(brtvd, idx);
+	brtvd->bv_entcount_dirty = TRUE;
+
+	dirtyidx = idx / BRT_BLOCKSIZE / 8;
+	BT_SET(brtvd->bv_bitmap, dirtyidx);
+
+#ifdef ZFS_DEBUG
+	brt_vdev_dump(brt);
+#endif
+}
+
+/*
+ * Write the in-core state of a BRT VDEV to its MOS object.
+ *
+ * If the entry counters are dirty, the whole counters array is written at
+ * offset 0 of the object and the bitmap is cleared. The bonus buffer
+ * (brt_vdev_phys_t) is then updated from the in-core fields and the
+ * meta-dirty flag is cleared.
+ *
+ * Must be called in syncing context on a VDEV that has its MOS object
+ * created and is marked meta-dirty.
+ */
+static void
+brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	brt_vdev_phys_t *bvphys;
+
+	ASSERT(brtvd->bv_meta_dirty);
+	ASSERT(brtvd->bv_mos_brtvdev != 0);
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+
+	if (brtvd->bv_entcount_dirty) {
+		/*
+		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
+		 */
+		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
+		    brtvd->bv_entcount, tx);
+		/* Everything was written, so no block is dirty anymore. */
+		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
+		brtvd->bv_entcount_dirty = FALSE;
+	}
+
+	/* Copy the in-core metadata into the bonus buffer. */
+	dmu_buf_will_dirty(db, tx);
+	bvphys = db->db_data;
+	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
+	bvphys->bvp_size = brtvd->bv_size;
+	if (brtvd->bv_need_byteswap) {
+		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
+	} else {
+		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
+	}
+	bvphys->bvp_totalcount = brtvd->bv_totalcount;
+	bvphys->bvp_rangesize = brt->brt_rangesize;
+	bvphys->bvp_usedspace = brtvd->bv_usedspace;
+	bvphys->bvp_savedspace = brtvd->bv_savedspace;
+	dmu_buf_rele(db, FTAG);
+
+	brtvd->bv_meta_dirty = FALSE;
+}
+
+/*
+ * Size the brt_vdevs[] array to the number of children of the root VDEV
+ * and, when load is set, load every BRT VDEV. If no range size has been
+ * set by the time loading is done, the default BRT_RANGESIZE is used.
+ *
+ * Takes and releases the BRT write lock.
+ */
+static void
+brt_vdevs_alloc(brt_t *brt, boolean_t load)
+{
+	uint64_t vdevid;
+
+	brt_wlock(brt);
+
+	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
+
+	for (vdevid = 0; load && vdevid < brt->brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
+
+		ASSERT(brtvd->bv_entcount == NULL);
+
+		brt_vdev_load(brt, brtvd);
+	}
+
+	if (brt->brt_rangesize == 0)
+		brt->brt_rangesize = BRT_RANGESIZE;
+
+	brt_unlock(brt);
+}
+
+/*
+ * Deallocate the in-core state of every initiated BRT VDEV and free the
+ * brt_vdevs[] array itself.
+ *
+ * Takes and releases the BRT write lock.
+ */
+static void
+brt_vdevs_free(brt_t *brt)
+{
+	uint64_t vdevid;
+
+	brt_wlock(brt);
+
+	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+		/* VDEVs that were never initiated hold no allocations. */
+		if (!brt->brt_vdevs[vdevid].bv_initiated)
+			continue;
+
+		brt_vdev_dealloc(brt, &brt->brt_vdevs[vdevid]);
+	}
+
+	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
+
+	brt_unlock(brt);
+}
+
+/*
+ * Initialize a BRT entry from the first DVA of the given block pointer:
+ * the entry gets the DVA's offset and a zero reference count, and the DVA's
+ * VDEV id is returned through vdevidp.
+ */
+static void
+brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
+{
+	const dva_t *dva = &bp->blk_dva[0];
+
+	*vdevidp = DVA_GET_VDEV(dva);
+
+	bre->bre_refcount = 0;
+	bre->bre_offset = DVA_GET_OFFSET(dva);
+}
+
+/*
+ * Comparison callback for BRT entries: orders them by bre_offset.
+ */
+static int
+brt_entry_compare(const void *x1, const void *x2)
+{
+	const brt_entry_t *lhs = x1, *rhs = x2;
+
+	return (TREE_CMP(lhs->bre_offset, rhs->bre_offset));
+}
+
+/*
+ * Look up the reference count of the given entry in the on-disk entries ZAP
+ * of a BRT VDEV.
+ *
+ * On success 0 is returned and bre->bre_refcount holds the value read from
+ * the ZAP. ENOENT is returned when the in-core counters show no entries for
+ * the entry's range or when the VDEV has no entries ZAP; other errors come
+ * from the ZAP calls.
+ *
+ * Locking: the caller must hold the BRT lock. When ENOENT is returned
+ * before the ZAP is consulted, the lock is left untouched. Otherwise the
+ * lock is dropped around the ZAP calls and reacquired for writing before
+ * returning. While the lock is dropped brt_vdevs[] may be reallocated, so
+ * brtvd is not dereferenced after the unlock and callers must look their
+ * brtvd pointer up again.
+ */
+static int
+brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
+{
+	uint64_t mos_entries, vdevid;
+	uint64_t one, physsize;
+	int error;
+
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+	if (!brt_vdev_lookup(brt, brtvd, bre))
+		return (SET_ERROR(ENOENT));
+
+	/*
+	 * Remember the mos_entries object number and the VDEV id while the
+	 * lock is still held. Once the BRT lock is dropped, the brtvd pointer
+	 * may be invalid and must not be used.
+	 */
+	mos_entries = brtvd->bv_mos_entries;
+	if (mos_entries == 0)
+		return (SET_ERROR(ENOENT));
+	vdevid = brtvd->bv_vdevid;
+	/* Only consumed by BRT_DEBUG(), which may compile to nothing. */
+	(void) vdevid;
+
+	brt_unlock(brt);
+
+	/* Check that the entry exists and has the expected shape first. */
+	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
+	    BRT_KEY_WORDS, &one, &physsize);
+	if (error == 0) {
+		ASSERT3U(one, ==, 1);
+		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
+
+		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
+		    &bre->bre_offset, BRT_KEY_WORDS, 1,
+		    sizeof (bre->bre_refcount), &bre->bre_refcount);
+		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
+		    "count=%llu error=%d", (u_longlong_t)mos_entries,
+		    (u_longlong_t)vdevid,
+		    (u_longlong_t)bre->bre_offset,
+		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
+	}
+
+	brt_wlock(brt);
+
+	return (error);
+}
+
+/*
+ * Issue a ZAP prefetch for the given entry in the entries object of the
+ * given VDEV. Does nothing when the VDEV is unknown or has no entries
+ * object. The BRT read lock is held only while the object number is read.
+ */
+static void
+brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
+{
+	brt_vdev_t *brtvd;
+	uint64_t mos_entries;
+
+	brt_rlock(brt);
+	brtvd = brt_vdev(brt, vdevid);
+	mos_entries = (brtvd != NULL) ? brtvd->bv_mos_entries : 0;
+	brt_unlock(brt);
+
+	if (mos_entries != 0) {
+		BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
+		    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
+		    (u_longlong_t)bre->bre_offset);
+		(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
+		    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
+	}
+}
+
+/*
+ * Store the entry's (non-zero) reference count in the VDEV's entries ZAP,
+ * keyed by the entry's offset. Returns the zap_update_uint64() result.
+ */
+static int
+brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+	uint64_t *key = (uint64_t *)&bre->bre_offset;
+	uint64_t object;
+	int error;
+
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+	ASSERT(brtvd->bv_mos_entries != 0);
+	ASSERT(bre->bre_refcount > 0);
+
+	object = brtvd->bv_mos_entries;
+	error = zap_update_uint64(brt->brt_mos, object, key, BRT_KEY_WORDS, 1,
+	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
+	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
+	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+	    (u_longlong_t)bre->bre_refcount, error);
+
+	return (error);
+}
+
+/*
+ * Remove the entry, whose reference count must be zero, from the VDEV's
+ * entries ZAP. Returns the zap_remove_uint64() result.
+ */
+static int
+brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+	uint64_t *key = (uint64_t *)&bre->bre_offset;
+	uint64_t object;
+	int error;
+
+	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+	ASSERT(brtvd->bv_mos_entries != 0);
+	ASSERT0(bre->bre_refcount);
+
+	object = brtvd->bv_mos_entries;
+	error = zap_remove_uint64(brt->brt_mos, object, key, BRT_KEY_WORDS,
+	    tx);
+	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
+	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+	    (u_longlong_t)bre->bre_refcount, error);
+
+	return (error);
+}
+
+/*
+ * Quick check whether a BRT entry _can_ exist for this bp.
+ *
+ * TRUE may be a false positive; it only tells the caller that it is worth
+ * looking into the BRT, which may require reads and is thus more expensive.
+ * TRUE is returned when the bp's VDEV is initiated and either has entries
+ * in its in-memory tree or its counters show entries in the bp's range.
+ */
+boolean_t
+brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
+{
+	brt_t *brt = spa->spa_brt;
+	brt_entry_t bre_search;
+	brt_vdev_t *brtvd;
+	uint64_t vdevid;
+	boolean_t found = FALSE;
+
+	brt_entry_fill(bp, &bre_search, &vdevid);
+
+	brt_rlock(brt);
+
+	brtvd = brt_vdev(brt, vdevid);
+	if (brtvd != NULL && brtvd->bv_initiated) {
+		if (!avl_is_empty(&brtvd->bv_tree))
+			found = TRUE;
+		else if (brt_vdev_lookup(brt, brtvd, &bre_search))
+			found = TRUE;
+	}
+
+	brt_unlock(brt);
+
+	return (found);
+}
+
+/*
+ * Return the pool-wide brt_savedspace counter, or 0 if the pool has no BRT.
+ */
+uint64_t
+brt_get_dspace(spa_t *spa)
+{
+	brt_t *brt = spa->spa_brt;
+
+	return (brt != NULL ? brt->brt_savedspace : 0);
+}
+
+/*
+ * Return the pool-wide brt_usedspace counter, or 0 if the pool has no BRT.
+ */
+uint64_t
+brt_get_used(spa_t *spa)
+{
+	brt_t *brt = spa->spa_brt;
+
+	return (brt != NULL ? brt->brt_usedspace : 0);
+}
+
+/*
+ * Return the pool-wide brt_savedspace counter, or 0 if the pool has no BRT.
+ */
+uint64_t
+brt_get_saved(spa_t *spa)
+{
+	brt_t *brt = spa->spa_brt;
+
+	return (brt != NULL ? brt->brt_savedspace : 0);
+}
+
+/*
+ * Return the ratio (in percent) of used plus saved space to used space:
+ * (usedspace + savedspace) * 100 / usedspace.
+ *
+ * 100 is returned when nothing is accounted as used or, like the other
+ * brt_get_*() accessors tolerate, when the pool has no BRT allocated.
+ */
+uint64_t
+brt_get_ratio(spa_t *spa)
+{
+	brt_t *brt = spa->spa_brt;
+
+	/* Also guards the division below against a zero divisor. */
+	if (brt == NULL || brt->brt_usedspace == 0)
+		return (100);
+
+	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
+	    brt->brt_usedspace);
+}
+
+static int
+brt_kstats_update(kstat_t *ksp, int rw)
+{
+	brt_stats_t *bs = ksp->ks_data;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	bs->brt_addref_entry_in_memory.value.ui64 =
+	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
+	bs->brt_addref_entry_not_on_disk.value.ui64 =
+	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
+	bs->brt_addref_entry_on_disk.value.ui64 =
+	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
+	bs->brt_addref_entry_read_lost_race.value.ui64 =
+	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
+	bs->brt_decref_entry_in_memory.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
+	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
+	bs->brt_decref_entry_not_in_memory.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
+	bs->brt_decref_entry_not_on_disk.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
+	bs->brt_decref_entry_read_lost_race.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
+	bs->brt_decref_entry_still_referenced.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
+	bs->brt_decref_free_data_later.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_free_data_later);
+	bs->brt_decref_free_data_now.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_free_data_now);
+	bs->brt_decref_no_entry.value.ui64 =
+	    wmsum_value(&brt_sums.brt_decref_no_entry);
+
+	return (0);
+}
+
+static void
+brt_stat_init(void)
+{
+
+	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
+	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
+	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
+	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
+	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
+	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
+	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
+	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
+
+	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (brt_ksp != NULL) {
+		brt_ksp->ks_data = &brt_stats;
+		brt_ksp->ks_update = brt_kstats_update;
+		kstat_install(brt_ksp);
+	}
+}
+
+static void
+brt_stat_fini(void)
+{
+	if (brt_ksp != NULL) {
+		kstat_delete(brt_ksp);
+		brt_ksp = NULL;
+	}
+
+	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
+	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
+	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
+	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
+	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
+	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
+	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
+	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
+	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
+	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
+	wmsum_fini(&brt_sums.brt_decref_free_data_later);
+	wmsum_fini(&brt_sums.brt_decref_free_data_now);
+	wmsum_fini(&brt_sums.brt_decref_no_entry);
+}
+
+/*
+ * Global BRT initialization: create the kmem caches used for BRT entries
+ * and pending entries, then set up the statistics.
+ */
+void
+brt_init(void)
+{
+	const size_t entry_size = sizeof (brt_entry_t);
+	const size_t pending_size = sizeof (brt_pending_entry_t);
+
+	brt_entry_cache = kmem_cache_create("brt_entry_cache", entry_size,
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
+	    pending_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	brt_stat_init();
+}
+
+/*
+ * Global BRT teardown: tear down the statistics and destroy both kmem
+ * caches created by brt_init().
+ */
+void
+brt_fini(void)
+{
+	brt_stat_fini();
+
+	kmem_cache_destroy(brt_entry_cache);
+	kmem_cache_destroy(brt_pending_entry_cache);
+}
+
+/*
+ * Allocate a BRT entry from brt_entry_cache and copy the offset and the
+ * reference count from bre_init. The allocation sleeps (KM_SLEEP).
+ */
+static brt_entry_t *
+brt_entry_alloc(const brt_entry_t *bre_init)
+{
+	brt_entry_t *newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
+
+	newbre->bre_refcount = bre_init->bre_refcount;
+	newbre->bre_offset = bre_init->bre_offset;
+
+	return (newbre);
+}
+
+/*
+ * Return a BRT entry to brt_entry_cache.
+ */
+static void
+brt_entry_free(brt_entry_t *bre)
+{
+
+	kmem_cache_free(brt_entry_cache, bre);
+}
+
+/*
+ * Add one reference to the BRT entry that describes the given block.
+ *
+ * The entry is looked up in the VDEV's in-memory tree first and then on
+ * disk; if it is in neither place, a new entry is created with the
+ * reference count that brt_entry_fill() set (zero). brt_vdevs[] is expanded
+ * and the VDEV is (re)allocated on demand. Finally the reference count is
+ * incremented and the VDEV accounting is updated via brt_vdev_addref().
+ *
+ * Takes and releases the BRT write lock; the caller must not hold it for
+ * writing.
+ */
+static void
+brt_entry_addref(brt_t *brt, const blkptr_t *bp)
+{
+	brt_vdev_t *brtvd;
+	brt_entry_t *bre, *racebre;
+	brt_entry_t bre_search;
+	avl_index_t where;
+	uint64_t vdevid;
+	int error;
+
+	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
+
+	brt_entry_fill(bp, &bre_search, &vdevid);
+
+	brt_wlock(brt);
+
+	brtvd = brt_vdev(brt, vdevid);
+	if (brtvd == NULL) {
+		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
+
+		/* New VDEV was added. */
+		brt_vdevs_expand(brt, vdevid + 1);
+		brtvd = brt_vdev(brt, vdevid);
+	}
+	ASSERT(brtvd != NULL);
+	if (!brtvd->bv_initiated)
+		brt_vdev_realloc(brt, brtvd);
+
+	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+	if (bre != NULL) {
+		BRTSTAT_BUMP(brt_addref_entry_in_memory);
+	} else {
+		/*
+		 * brt_entry_lookup() may drop the BRT (read) lock and
+		 * reacquire it (write).
+		 */
+		error = brt_entry_lookup(brt, brtvd, &bre_search);
+		/* bre_search now contains correct bre_refcount */
+		ASSERT(error == 0 || error == ENOENT);
+		if (error == 0)
+			BRTSTAT_BUMP(brt_addref_entry_on_disk);
+		else
+			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
+		/*
+		 * When the BRT lock was dropped, brt_vdevs[] may have been
+		 * expanded and reallocated, we need to update brtvd's pointer.
+		 */
+		brtvd = brt_vdev(brt, vdevid);
+		ASSERT(brtvd != NULL);
+
+		/* Search again: the tree may have changed in the meantime. */
+		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+		if (racebre == NULL) {
+			bre = brt_entry_alloc(&bre_search);
+			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+			avl_insert(&brtvd->bv_tree, bre, where);
+			brt->brt_nentries++;
+		} else {
+			/*
+			 * The entry was added when the BRT lock was dropped in
+			 * brt_entry_lookup().
+			 */
+			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
+			bre = racebre;
+		}
+	}
+	bre->bre_refcount++;
+	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+	brt_unlock(brt);
+}
+
+/*
+ * Drop one reference from the BRT entry that describes the given block.
+ *
+ * Return TRUE if block should be freed immediately: either there is no BRT
+ * entry for it (neither in memory nor on disk), or the entry's reference
+ * count is already zero. Otherwise the reference count is decremented, the
+ * VDEV accounting is updated via brt_vdev_decref() and FALSE is returned.
+ *
+ * Takes and releases the BRT write lock.
+ */
+boolean_t
+brt_entry_decref(spa_t *spa, const blkptr_t *bp)
+{
+	brt_t *brt = spa->spa_brt;
+	brt_vdev_t *brtvd;
+	brt_entry_t *bre, *racebre;
+	brt_entry_t bre_search;
+	avl_index_t where;
+	uint64_t vdevid;
+	int error;
+
+	brt_entry_fill(bp, &bre_search, &vdevid);
+
+	brt_wlock(brt);
+
+	brtvd = brt_vdev(brt, vdevid);
+	ASSERT(brtvd != NULL);
+
+	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+	if (bre != NULL) {
+		BRTSTAT_BUMP(brt_decref_entry_in_memory);
+		goto out;
+	} else {
+		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
+	}
+
+	/*
+	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
+	 */
+	error = brt_entry_lookup(brt, brtvd, &bre_search);
+	/* bre_search now contains correct bre_refcount */
+	ASSERT(error == 0 || error == ENOENT);
+	/*
+	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
+	 * and reallocated, we need to update brtvd's pointer.
+	 */
+	brtvd = brt_vdev(brt, vdevid);
+	ASSERT(brtvd != NULL);
+
+	if (error == ENOENT) {
+		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
+		bre = NULL;
+		goto out;
+	}
+
+	/* Search again: the tree may have changed in the meantime. */
+	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+	if (racebre != NULL) {
+		/*
+		 * The entry was added when the BRT lock was dropped in
+		 * brt_entry_lookup().
+		 */
+		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
+		bre = racebre;
+		goto out;
+	}
+
+	/* Bring the on-disk entry into the in-memory tree. */
+	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
+	bre = brt_entry_alloc(&bre_search);
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	avl_insert(&brtvd->bv_tree, bre, where);
+	brt->brt_nentries++;
+
+out:
+	if (bre == NULL) {
+		/*
+		 * This is a free of a regular (not cloned) block.
+		 */
+		brt_unlock(brt);
+		BRTSTAT_BUMP(brt_decref_no_entry);
+		return (B_TRUE);
+	}
+	if (bre->bre_refcount == 0) {
+		/* No references are left, the data can be freed now. */
+		brt_unlock(brt);
+		BRTSTAT_BUMP(brt_decref_free_data_now);
+		return (B_TRUE);
+	}
+
+	ASSERT(bre->bre_refcount > 0);
+	bre->bre_refcount--;
+	if (bre->bre_refcount == 0)
+		BRTSTAT_BUMP(brt_decref_free_data_later);
+	else
+		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
+	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+	brt_unlock(brt);
+
+	return (B_FALSE);
+}
+
+/*
+ * Prefetch the BRT entry for the given block pointer, unless prefetching
+ * is turned off through zfs_brt_prefetch.
+ */
+static void
+brt_prefetch(brt_t *brt, const blkptr_t *bp)
+{
+	brt_entry_t bre;
+	uint64_t vdevid;
+
+	ASSERT(bp != NULL);
+
+	if (zfs_brt_prefetch) {
+		brt_entry_fill(bp, &bre, &vdevid);
+		brt_entry_prefetch(brt, vdevid, &bre);
+	}
+}
+
+/*
+ * Comparison callback for pending BRT entries. Entries are ordered by the
+ * physical birth of their block pointer, then by the VDEV id and finally
+ * by the offset of the first DVA.
+ */
+static int
+brt_pending_entry_compare(const void *x1, const void *x2)
+{
+	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
+	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
+	const dva_t *dva1 = &bp1->blk_dva[0], *dva2 = &bp2->blk_dva[0];
+	int cmp;
+
+	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
+	if (cmp != 0)
+		return (cmp);
+
+	cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+	if (cmp != 0)
+		return (cmp);
+
+	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+}
+
+/*
+ * Record a pending reference to the given block in the pending tree of the
+ * transaction's txg. If the block is already in the tree, its count is
+ * incremented; otherwise a new pending entry with a count of one is
+ * inserted. Afterwards the matching BRT entry is prefetched.
+ */
+void
+brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	brt_t *brt = spa->spa_brt;
+	uint64_t txg = dmu_tx_get_txg(tx);
+	brt_pending_entry_t *bpe, *newbpe;
+	avl_tree_t *pending_tree;
+	kmutex_t *pending_lock;
+	avl_index_t where;
+
+	ASSERT3U(txg, !=, 0);
+	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+	/* Allocate before taking the mutex; the allocation may sleep. */
+	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
+	newbpe->bpe_count = 1;
+	newbpe->bpe_bp = *bp;
+
+	mutex_enter(pending_lock);
+
+	bpe = avl_find(pending_tree, newbpe, &where);
+	if (bpe != NULL) {
+		bpe->bpe_count++;
+	} else {
+		/* The tree takes over newbpe. */
+		avl_insert(pending_tree, newbpe, where);
+		newbpe = NULL;
+	}
+
+	mutex_exit(pending_lock);
+
+	if (newbpe == NULL) {
+		ASSERT(bpe == NULL);
+	} else {
+		/* An entry already existed, our allocation is not needed. */
+		ASSERT(bpe != NULL);
+		ASSERT(bpe != newbpe);
+		kmem_cache_free(brt_pending_entry_cache, newbpe);
+	}
+
+	/* Prefetch BRT entry, as we will need it in the syncing context. */
+	brt_prefetch(brt, bp);
+}
+
+/*
+ * Drop one pending reference to the given block from the pending tree of
+ * the transaction's txg. The pending entry is removed and freed once its
+ * count reaches zero. Nothing happens if the block is not in the tree.
+ */
+void
+brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	brt_t *brt = spa->spa_brt;
+	uint64_t txg = dmu_tx_get_txg(tx);
+	brt_pending_entry_t *bpe, bpe_search;
+	avl_tree_t *pending_tree;
+	kmutex_t *pending_lock;
+
+	ASSERT3U(txg, !=, 0);
+	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+	/* Only the block pointer is filled in for the search key. */
+	bpe_search.bpe_bp = *bp;
+
+	mutex_enter(pending_lock);
+
+	/* I believe we should always find bpe when this function is called. */
+	bpe = avl_find(pending_tree, &bpe_search, NULL);
+	if (bpe != NULL) {
+		ASSERT(bpe->bpe_count > 0);
+
+		if (--bpe->bpe_count == 0) {
+			avl_remove(pending_tree, bpe);
+			kmem_cache_free(brt_pending_entry_cache, bpe);
+		}
+	}
+
+	mutex_exit(pending_lock);
+}
+
+/*
+ * Apply all pending references recorded for the given txg.
+ *
+ * Every entry is taken out of the txg's pending tree and, bpe_count times,
+ * a reference is added either to the DDT (when the block has the DEDUP bit
+ * set and ddt_addref() succeeds) or to the BRT. The pending entry is freed
+ * afterwards.
+ */
+void
+brt_pending_apply(spa_t *spa, uint64_t txg)
+{
+	brt_t *brt;
+	brt_pending_entry_t *bpe;
+	avl_tree_t *pending_tree;
+	kmutex_t *pending_lock;
+	void *c;
+
+	ASSERT3U(txg, !=, 0);
+
+	brt = spa->spa_brt;
+	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+	mutex_enter(pending_lock);
+
+	c = NULL;
+	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
+		boolean_t added_to_ddt;
+
+		/* The pending lock is not held while references are added. */
+		mutex_exit(pending_lock);
+
+		for (int i = 0; i < bpe->bpe_count; i++) {
+			/*
+			 * If the block has DEDUP bit set, it means that it
+			 * already exists in the DEDUP table, so we can just
+			 * use that instead of creating new entry in
+			 * the BRT table.
+			 */
+			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
+				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+			} else {
+				added_to_ddt = B_FALSE;
+			}
+			/* Fall back to the BRT if the DDT did not take it. */
+			if (!added_to_ddt)
+				brt_entry_addref(brt, &bpe->bpe_bp);
+		}
+
+		kmem_cache_free(brt_pending_entry_cache, bpe);
+		mutex_enter(pending_lock);
+	}
+
+	mutex_exit(pending_lock);
+}
+
+/*
+ * Write a single in-memory BRT entry to the VDEV's entries ZAP: entries
+ * with a zero reference count are removed from the ZAP, all others are
+ * updated with their current reference count.
+ *
+ * The BRT lock must be held for writing and the VDEV must have its entries
+ * ZAP created.
+ */
+static void
+brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+
+	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+	ASSERT(brtvd->bv_mos_entries != 0);
+
+	if (bre->bre_refcount == 0) {
+		int error;
+
+		error = brt_entry_remove(brt, brtvd, bre, tx);
+		ASSERT(error == 0 || error == ENOENT);
+		/*
+		 * If error == ENOENT then zfs_clone_range() was done from a
+		 * removed (but opened) file (open(), unlink()).
+		 */
+		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
+	} else {
+		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
+	}
+}
+
+/*
+ * Sync every dirty BRT VDEV to disk.
+ *
+ * For each initiated, meta-dirty VDEV: create its MOS objects if they do
+ * not exist yet, write out and free all entries from the in-memory tree,
+ * sync the VDEV metadata and, if the VDEV has no entries left, destroy its
+ * MOS objects. No in-memory entries are expected to remain at the end.
+ *
+ * Takes and releases the BRT write lock.
+ */
+static void
+brt_sync_table(brt_t *brt, dmu_tx_t *tx)
+{
+	brt_vdev_t *brtvd;
+	brt_entry_t *bre;
+	uint64_t vdevid;
+	void *c;
+
+	brt_wlock(brt);
+
+	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+		brtvd = &brt->brt_vdevs[vdevid];
+
+		if (!brtvd->bv_initiated)
+			continue;
+
+		/* Nothing changed for this VDEV. */
+		if (!brtvd->bv_meta_dirty) {
+			ASSERT(!brtvd->bv_entcount_dirty);
+			ASSERT0(avl_numnodes(&brtvd->bv_tree));
+			continue;
+		}
+
+		ASSERT(!brtvd->bv_entcount_dirty ||
+		    avl_numnodes(&brtvd->bv_tree) != 0);
+
+		if (brtvd->bv_mos_brtvdev == 0)
+			brt_vdev_create(brt, brtvd, tx);
+
+		/* Write out and free every entry of the in-memory tree. */
+		c = NULL;
+		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
+			brt_sync_entry(brt, brtvd, bre, tx);
+			brt_entry_free(bre);
+			ASSERT(brt->brt_nentries > 0);
+			brt->brt_nentries--;
+		}
+
+		brt_vdev_sync(brt, brtvd, tx);
+
+		/* The VDEV holds no entries anymore, remove it from disk. */
+		if (brtvd->bv_totalcount == 0)
+			brt_vdev_destroy(brt, brtvd, tx);
+	}
+
+	ASSERT0(brt->brt_nentries);
+
+	brt_unlock(brt);
+}
+
+/*
+ * Sync the BRT for the given (currently syncing) txg. Returns right away
+ * when there are no in-memory entries; otherwise the table is synced
+ * within a transaction assigned to this txg.
+ */
+void
+brt_sync(spa_t *spa, uint64_t txg)
+{
+	brt_t *brt = spa->spa_brt;
+	boolean_t changed;
+	dmu_tx_t *tx;
+
+	ASSERT(spa_syncing_txg(spa) == txg);
+
+	brt_rlock(brt);
+	changed = (brt->brt_nentries != 0) ? B_TRUE : B_FALSE;
+	brt_unlock(brt);
+
+	/* No changes. */
+	if (!changed)
+		return;
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+	brt_sync_table(brt, tx);
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Create the per-txg pending trees and initialize the mutexes that protect
+ * them, one pair for each of the TXG_SIZE slots.
+ */
+static void
+brt_table_alloc(brt_t *brt)
+{
+	const size_t nodesize = sizeof (brt_pending_entry_t);
+	const size_t nodeoff = offsetof(brt_pending_entry_t, bpe_node);
+
+	for (int i = 0; i < TXG_SIZE; i++) {
+		avl_tree_t *tree = &brt->brt_pending_tree[i];
+
+		avl_create(tree, brt_pending_entry_compare, nodesize, nodeoff);
+		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
+		    NULL);
+	}
+}
+
+/*
+ * Destroy the per-txg pending trees and their mutexes. All pending trees
+ * are expected to be empty at this point.
+ */
+static void
+brt_table_free(brt_t *brt)
+{
+
+	for (int i = 0; i < TXG_SIZE; i++) {
+		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
+
+		avl_destroy(&brt->brt_pending_tree[i]);
+		mutex_destroy(&brt->brt_pending_lock[i]);
+	}
+}
+
+/*
+ * Allocate and initialize an empty in-core BRT (no VDEVs, no entries, range
+ * size not yet set) and attach it to the pool. The pool must not have a
+ * BRT attached already.
+ */
+static void
+brt_alloc(spa_t *spa)
+{
+	brt_t *brt;
+
+	ASSERT(spa->spa_brt == NULL);
+
+	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
+
+	brt->brt_spa = spa;
+	brt->brt_vdevs = NULL;
+	brt->brt_nvdevs = 0;
+	brt->brt_nentries = 0;
+	brt->brt_rangesize = 0;
+
+	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
+	brt_table_alloc(brt);
+
+	spa->spa_brt = brt;
+}
+
+/*
+ * Set up the in-core BRT for a pool without loading any BRT VDEV state
+ * from disk.
+ */
+void
+brt_create(spa_t *spa)
+{
+
+	brt_alloc(spa);
+	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
+}
+
+/*
+ * Set up the in-core BRT for a pool and load the state of every BRT VDEV.
+ * Always returns 0.
+ */
+int
+brt_load(spa_t *spa)
+{
+
+	brt_alloc(spa);
+	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
+
+	return (0);
+}
+
+/*
+ * Free the in-core BRT of a pool: the BRT VDEVs, the pending trees, the
+ * lock and the brt_t itself. Does nothing if the pool has no BRT attached.
+ */
+void
+brt_unload(spa_t *spa)
+{
+	brt_t *brt = spa->spa_brt;
+
+	if (brt != NULL) {
+		brt_vdevs_free(brt);
+		brt_table_free(brt);
+		rw_destroy(&brt->brt_lock);
+		kmem_free(brt, sizeof (*brt));
+		spa->spa_brt = NULL;
+	}
+}
+
+/*
+ * Module parameters. The prefetch knob is checked by brt_prefetch(); the
+ * debug knob is only built when ZFS_BRT_DEBUG is defined.
+ */
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
+    "Enable prefetching of BRT entries");
+#ifdef ZFS_BRT_DEBUG
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
+#endif
+/* END CSTYLED */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 191e5e043..94c2ae9d7 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -49,6 +50,7 @@
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
@@ -1427,7 +1429,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 }
 
 static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
 	blkptr_t *bps = db->db.db_data;
 	uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1436,12 +1438,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
 	for (int i = 0; i < n_bps; i++) {
 		blkptr_t *bp = &bps[i];
 
-		ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
-		BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
-		    dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
-		BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
-		BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
-		BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+		ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+		BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+		    dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+		BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+		BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+		BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
 	}
 }
 
@@ -1451,30 +1453,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
  * was taken, ENOENT if no action was taken.
  */
 static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
-	int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+	int is_hole = bp == NULL || BP_IS_HOLE(bp);
 	/*
 	 * For level 0 blocks only, if the above check fails:
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
-	if (!is_hole && db->db_level == 0) {
-		is_hole = dnode_block_freed(dn, db->db_blkid) ||
-		    BP_IS_HOLE(db->db_blkptr);
-	}
+	if (!is_hole && db->db_level == 0)
+		is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
 	if (is_hole) {
 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
 		memset(db->db.db_data, 0, db->db.db_size);
 
-		if (db->db_blkptr != NULL && db->db_level > 0 &&
-		    BP_IS_HOLE(db->db_blkptr) &&
-		    db->db_blkptr->blk_birth != 0) {
-			dbuf_handle_indirect_hole(db, dn);
+		if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+		    bp->blk_birth != 0) {
+			dbuf_handle_indirect_hole(db, dn, bp);
 		}
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1551,12 +1550,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
+	blkptr_t bp, *bpp;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_state == DB_UNCACHED);
+	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_parent == NULL ||
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1566,16 +1566,46 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}
 
-	err = dbuf_read_hole(db, dn);
+	if (db->db_state == DB_UNCACHED) {
+		if (db->db_blkptr == NULL) {
+			bpp = NULL;
+		} else {
+			bp = *db->db_blkptr;
+			bpp = &bp;
+		}
+	} else {
+		struct dirty_leaf *dl;
+		dbuf_dirty_record_t *dr;
+
+		ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+		dr = list_head(&db->db_dirty_records);
+		if (dr == NULL) {
+			err = EIO;
+			goto early_unlock;
+		} else {
+			dl = &dr->dt.dl;
+			if (!dl->dr_brtwrite) {
+				err = EIO;
+				goto early_unlock;
+			}
+			bp = dl->dr_overridden_by;
+			bpp = &bp;
+		}
+	}
+
+	err = dbuf_read_hole(db, dn, bpp);
 	if (err == 0)
 		goto early_unlock;
 
+	ASSERT(bpp != NULL);
+
 	/*
 	 * Any attempt to read a redacted block should result in an error. This
 	 * will never happen under normal conditions, but can be useful for
 	 * debugging purposes.
 	 */
-	if (BP_IS_REDACTED(db->db_blkptr)) {
+	if (BP_IS_REDACTED(bpp)) {
 		ASSERT(dsl_dataset_feature_is_active(
 		    db->db_objset->os_dsl_dataset,
 		    SPA_FEATURE_REDACTED_DATASETS));
@@ -1590,7 +1620,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	 * All bps of an encrypted os should have the encryption bit set.
 	 * If this is not true it indicates tampering and we report an error.
 	 */
-	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
 		spa_log_error(db->db_objset->os_spa, &zb);
 		zfs_panic_recover("unencrypted block in encrypted "
 		    "object set %llu", dmu_objset_id(db->db_objset));
@@ -1621,15 +1651,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
 		zio_flags |= ZIO_FLAG_RAW;
 	/*
-	 * The zio layer will copy the provided blkptr later, but we need to
-	 * do this now so that we can release the parent's rwlock. We have to
-	 * do that now so that if dbuf_read_done is called synchronously (on
+	 * The zio layer will copy the provided blkptr later, but we have our
+	 * own copy so that we can release the parent's rwlock. We have to
+	 * do that so that if dbuf_read_done is called synchronously (on
 	 * an l1 cache hit) we don't acquire the db_mtx while holding the
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
-	blkptr_t bp = *db->db_blkptr;
 	dmu_buf_unlock_parent(db, dblt, tag);
-	(void) arc_read(zio, db->db_objset->os_spa, &bp,
+	(void) arc_read(zio, db->db_objset->os_spa, bpp,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
 	    &aflags, &zb);
 	return (err);
@@ -1731,9 +1760,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	 */
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
-	if (db->db_state == DB_NOFILL)
-		return (SET_ERROR(EIO));
-
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
@@ -1780,13 +1806,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_hits);
-	} else if (db->db_state == DB_UNCACHED) {
+	} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
 		boolean_t need_wait = B_FALSE;
 
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-		if (zio == NULL &&
-		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+		if (zio == NULL && (db->db_state == DB_NOFILL ||
+		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
@@ -1913,7 +1939,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
-	arc_release(dr->dt.dl.dr_data, db);
+	if (!dr->dt.dl.dr_brtwrite)
+		arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
@@ -1996,6 +2023,11 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
+				if (dr->dt.dl.dr_brtwrite) {
+					ASSERT(db->db.db_data == NULL);
+					mutex_exit(&db->db_mtx);
+					continue;
+				}
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
@@ -2285,7 +2317,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-	if (db->db_blkid != DMU_BONUS_BLKID) {
+	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dmu_objset_willuse_space(os, db->db.db_size, tx);
 	}
 
@@ -2328,8 +2360,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
-	if (db->db_blkid != DMU_BONUS_BLKID)
+	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dr->dr_accounted = db->db.db_size;
+	}
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2489,6 +2522,7 @@ static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
+	boolean_t brtwrite;
 
 	ASSERT(txg != 0);
 
@@ -2513,6 +2547,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		return (B_FALSE);
 	ASSERT(dr->dr_dbuf == db);
 
+	brtwrite = dr->dt.dl.dr_brtwrite;
+	if (brtwrite) {
+		/*
+		 * We are freeing a block that we cloned in the same
+		 * transaction group.
+		 */
+		brt_pending_remove(dmu_objset_spa(db->db_objset),
+		    &dr->dt.dl.dr_overridden_by, tx);
+	}
+
 	dnode_t *dn = dr->dr_dnode;
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2542,7 +2586,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	if (db->db_state != DB_NOFILL) {
+	if (db->db_state != DB_NOFILL && !brtwrite) {
 		dbuf_unoverride(dr);
 
 		ASSERT(db->db_buf != NULL);
@@ -2557,7 +2601,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	db->db_dirtycnt -= 1;
 
 	if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+		ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+		    arc_released(db->db_buf));
 		dbuf_destroy(db);
 		return (B_TRUE);
 	}
@@ -4748,8 +4793,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
-			if (dr->dt.dl.dr_data != db->db_buf)
+			if (dr->dt.dl.dr_data != NULL &&
+			    dr->dt.dl.dr_data != db->db_buf) {
 				arc_buf_destroy(dr->dt.dl.dr_data, db);
+			}
 		}
 	} else {
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -5046,7 +5093,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+		    dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 7880a899a..33fea0ba3 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -1180,5 +1181,59 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 	return (SET_ERROR(ENOENT));
 }
 
+/*
+ * This function is used by Block Cloning (brt.c) to increase reference
+ * counter for the DDT entry if the block is already in DDT.
+ *
+ * Return false if the block, despite having the D bit set, is not present
+ * in the DDT. Currently this is not possible but might be in the future.
+ * See the comment below.
+ */
+boolean_t
+ddt_addref(spa_t *spa, const blkptr_t *bp)
+{
+	ddt_t *ddt;
+	ddt_entry_t *dde;
+	boolean_t result;
+
+	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+	ddt = ddt_select(spa, bp);
+	ddt_enter(ddt);
+
+	dde = ddt_lookup(ddt, bp, B_TRUE);
+	ASSERT(dde != NULL);
+
+	if (dde->dde_type < DDT_TYPES) {
+		ddt_phys_t *ddp;
+
+		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
+
+		ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
+		if (ddp->ddp_refcnt == 0) {
+			/* This should never happen? */
+			ddt_phys_fill(ddp, bp);
+		}
+		ddt_phys_addref(ddp);
+		result = B_TRUE;
+	} else {
+		/*
+		 * At the time of implementing this, if the block has the
+		 * DEDUP flag set it must exist in the DEDUP table, but
+		 * there are many advocates that want the ability to remove
+		 * entries from the DDT with refcnt=1. If that happens,
+		 * we may have a block with the DEDUP bit set that doesn't
+		 * have a corresponding entry in the DDT. Be ready.
+		 */
+		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
+		ddt_remove(ddt, dde);
+		result = B_FALSE;
+	}
+
+	ddt_exit(ddt);
+	spa_config_exit(spa, SCL_ZIO, FTAG);
+
+	return (result);
+}
+
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 9b8fc7e49..e6bade11c 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -29,6 +29,7 @@
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
@@ -52,6 +53,7 @@
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
@@ -513,7 +515,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
 
-	ASSERT(length <= DMU_MAX_ACCESS);
+	ASSERT(!read || length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
@@ -2165,6 +2167,155 @@ restart:
 	return (err);
 }
 
+int
+dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+    dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp)
+{
+	dmu_buf_t **dbp, *dbuf;
+	dmu_buf_impl_t *db;
+	blkptr_t *bp;
+	int error, numbufs;
+
+	error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+	    &numbufs, &dbp);
+	if (error != 0) {
+		if (error == ESRCH) {
+			error = SET_ERROR(ENXIO);
+		}
+		return (error);
+	}
+
+	ASSERT3U(numbufs, <=, *nbpsp);
+
+	for (int i = 0; i < numbufs; i++) {
+		dbuf = dbp[i];
+		db = (dmu_buf_impl_t *)dbuf;
+		bp = db->db_blkptr;
+
+		/*
+		 * If the block is not on the disk yet, it has no BP assigned.
+		 * There is not much we can do...
+		 */
+		if (!list_is_empty(&db->db_dirty_records)) {
+			dbuf_dirty_record_t *dr;
+
+			dr = list_head(&db->db_dirty_records);
+			if (dr->dt.dl.dr_brtwrite) {
+				/*
+				 * This is a very special case where we clone a
+				 * block and in the same transaction group we
+				 * read its BP (most likely to clone the clone).
+				 */
+				bp = &dr->dt.dl.dr_overridden_by;
+			} else {
+				/*
+				 * The block was modified in the same
+				 * transaction group.
+				 */
+				error = SET_ERROR(EAGAIN);
+				goto out;
+			}
+		}
+		if (bp == NULL) {
+			/*
+			 * The block was created in this transaction group,
+			 * so it has no BP yet.
+			 */
+			error = SET_ERROR(EAGAIN);
+			goto out;
+		}
+		if (dmu_buf_is_dirty(dbuf, tx)) {
+			error = SET_ERROR(EAGAIN);
+			goto out;
+		}
+		/*
+		 * Make sure we clone only data blocks.
+		 */
+		if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		bps[i] = *bp;
+	}
+
+	*nbpsp = numbufs;
+out:
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+	return (error);
+}
+
+void
+dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
+{
+	spa_t *spa;
+	dmu_buf_t **dbp, *dbuf;
+	dmu_buf_impl_t *db;
+	struct dirty_leaf *dl;
+	dbuf_dirty_record_t *dr;
+	const blkptr_t *bp;
+	int numbufs;
+
+	spa = os->os_spa;
+
+	VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+	    &numbufs, &dbp));
+	ASSERT3U(nbps, ==, numbufs);
+
+	for (int i = 0; i < numbufs; i++) {
+		dbuf = dbp[i];
+		db = (dmu_buf_impl_t *)dbuf;
+		bp = &bps[i];
+
+		ASSERT0(db->db_level);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+		ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
+
+		if (db->db_state == DB_UNCACHED) {
+			/*
+			 * XXX-PJD: If the dbuf is already cached, calling
+			 * dmu_buf_will_not_fill() will panic on assertion
+			 * (db->db_buf == NULL) in dbuf_clear_data(),
+			 * which is called from dbuf_noread() in DB_NOFILL
+			 * case. I'm not 100% sure this is the right thing
+			 * to do, but it seems to work.
+			 */
+			dmu_buf_will_not_fill(dbuf, tx);
+		}
+
+		dr = list_head(&db->db_dirty_records);
+		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+		dl = &dr->dt.dl;
+		dl->dr_overridden_by = *bp;
+		dl->dr_brtwrite = B_TRUE;
+
+		dl->dr_override_state = DR_OVERRIDDEN;
+		if (BP_IS_HOLE(bp)) {
+			dl->dr_overridden_by.blk_birth = 0;
+			dl->dr_overridden_by.blk_phys_birth = 0;
+		} else {
+			dl->dr_overridden_by.blk_birth = dr->dr_txg;
+			dl->dr_overridden_by.blk_phys_birth =
+			    BP_PHYSICAL_BIRTH(bp);
+		}
+
+		/*
+		 * When data is embedded into BP there is no need to create
+		 * BRT entry as there is no data block. Just copy the BP as
+		 * it contains the data.
+		 * Also, when replaying ZIL we don't want to bump references
+		 * in the BRT as it was already done during ZIL claim.
+		 */
+		if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+			brt_pending_add(spa, bp, tx);
+		}
+	}
+
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 815e27a6c..1c5608c45 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -349,7 +349,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx)
 }
 
 static void
-dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	dmu_tx_t *tx = txh->txh_tx;
 	dnode_t *dn = txh->txh_dnode;
@@ -357,15 +357,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 
 	ASSERT(tx->tx_txg == 0);
 
-	dmu_tx_count_dnode(txh);
-
 	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
 
-	dmu_tx_count_dnode(txh);
-
 	/*
 	 * For i/o error checking, we read the first and last level-0
 	 * blocks if they are not aligned, and all the level-1 blocks.
@@ -445,8 +441,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_FREE, off, len);
-	if (txh != NULL)
-		(void) dmu_tx_hold_free_impl(txh, off, len);
+	if (txh != NULL) {
+		dmu_tx_count_dnode(txh);
+		dmu_tx_count_free(txh, off, len);
+	}
 }
 
 void
@@ -455,8 +453,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
 	dmu_tx_hold_t *txh;
 
 	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
-	if (txh != NULL)
-		(void) dmu_tx_hold_free_impl(txh, off, len);
+	if (txh != NULL) {
+		dmu_tx_count_dnode(txh);
+		dmu_tx_count_free(txh, off, len);
+	}
+}
+
+static void
+dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+
+	/*
+	 * Reuse dmu_tx_count_free(), it does exactly what we need for clone.
+	 */
+	dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT0(tx->tx_txg);
+	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
+	if (txh != NULL) {
+		dmu_tx_count_dnode(txh);
+		dmu_tx_count_clone(txh, off, len);
+	}
 }
 
 static void
@@ -667,6 +692,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 			case THT_NEWOBJECT:
 				match_object = TRUE;
 				break;
+			case THT_CLONE:
+				if (blkid >= beginblk && blkid <= endblk)
+					match_offset = TRUE;
+				break;
 			default:
 				cmn_err(CE_PANIC, "bad txh_type %d",
 				    txh->txh_type);
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 021cba68c..8e3fd126c 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -47,6 +47,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
@@ -3499,11 +3500,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 		scn->scn_dedup_frees_this_txg = 0;
 
 		/*
-		 * Write out changes to the DDT that may be required as a
-		 * result of the blocks freed.  This ensures that the DDT
-		 * is clean when a scrub/resilver runs.
+		 * Write out changes to the DDT and the BRT that may be required
+		 * as a result of the blocks freed.  This ensures that the DDT
+		 * and the BRT are clean when a scrub/resilver runs.
 		 */
 		ddt_sync(spa, tx->tx_txg);
+		brt_sync(spa, tx->tx_txg);
 	}
 	if (err != 0)
 		return (err);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 6be6fe115..98a302237 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -52,6 +52,7 @@
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_removal.h>
@@ -341,6 +342,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
+		    brt_get_used(spa), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
+		    brt_get_saved(spa), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
+		    brt_get_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
@@ -1707,6 +1714,7 @@ spa_unload(spa_t *spa)
 	}
 
 	ddt_unload(spa);
+	brt_unload(spa);
 	spa_unload_log_sm_metadata(spa);
 
 	/*
@@ -4415,6 +4423,21 @@ spa_ld_load_dedup_tables(spa_t *spa)
 }
 
 static int
+spa_ld_load_brt(spa_t *spa)
+{
+	int error = 0;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	error = brt_load(spa);
+	if (error != 0) {
+		spa_load_failed(spa, "brt_load failed [error=%d]", error);
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
+	return (0);
+}
+
+static int
 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
@@ -4895,6 +4918,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	if (error != 0)
 		return (error);
 
+	error = spa_ld_load_brt(spa);
+	if (error != 0)
+		return (error);
+
 	/*
 	 * Verify the logs now to make sure we don't have any unexpected errors
 	 * when we claim log blocks later.
@@ -5963,6 +5990,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
+	/*
+	 * Create BRT table and BRT table object.
+	 */
+	brt_create(spa);
 
 	spa_update_dspace(spa);
 
@@ -9138,6 +9169,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
+		brt_sync(spa, txg);
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 		svr_sync(spa, tx);
@@ -9263,6 +9295,13 @@ spa_sync(spa_t *spa, uint64_t txg)
 	    ZIO_FLAG_CANFAIL);
 
 	/*
+	 * Now that there can be no more cloning in this transaction group,
+	 * but we are still before issuing frees, we can process pending BRT
+	 * updates.
+	 */
+	brt_pending_apply(spa, txg);
+
+	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 53763e915..8466fa80e 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -57,6 +57,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
@@ -1834,7 +1835,7 @@ void
 spa_update_dspace(spa_t *spa)
 {
 	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
-	    ddt_get_dedup_dspace(spa);
+	    ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
 	if (spa->spa_nonallocating_dspace > 0) {
 		/*
 		 * Subtract the space provided by all non-allocating vdevs that
@@ -2410,6 +2411,7 @@ spa_init(spa_mode_t mode)
 	unique_init();
 	zfs_btree_init();
 	metaslab_stat_init();
+	brt_init();
 	ddt_init();
 	zio_init();
 	dmu_init();
@@ -2446,6 +2448,7 @@ spa_fini(void)
 	dmu_fini();
 	zio_fini();
 	ddt_fini();
+	brt_fini();
 	metaslab_stat_fini();
 	zfs_btree_fini();
 	unique_fini();
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 24ae0a00d..9b859adc5 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Portions Copyright 2012 Pawel Jakub Dawidek <[email protected]>
+ * Copyright (c) 2012 Pawel Jakub Dawidek
  * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 77bf9140d..d009c58d8 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
  */
 
 
@@ -891,5 +892,56 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
 	zil_itx_assign(zilog, itx, tx);
 }
 
+/*
+ * Handles TX_CLONE_RANGE transactions.
+ */
+void
+zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
+    uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
+    size_t nbps)
+{
+	itx_t *itx;
+	lr_clone_range_t *lr;
+	uint64_t partlen, max_log_data;
+	size_t i, partnbps;
+
+	VERIFY(!zil_replaying(zilog, tx));
+
+	if (zp->z_unlinked)
+		return;
+
+	max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
+
+	while (nbps > 0) {
+		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
+		partlen = 0;
+		for (i = 0; i < partnbps; i++) {
+			partlen += BP_GET_LSIZE(&bps[i]);
+		}
+		partlen = MIN(partlen, len);
+
+		itx = zil_itx_create(txtype,
+		    sizeof (*lr) + sizeof (bps[0]) * partnbps);
+		lr = (lr_clone_range_t *)&itx->itx_lr;
+		lr->lr_foid = zp->z_id;
+		lr->lr_offset = off;
+		lr->lr_length = partlen;
+		lr->lr_blksz = blksz;
+		lr->lr_nbps = partnbps;
+		memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
+
+		itx->itx_sync = (zp->z_sync_cnt != 0);
+
+		zil_itx_assign(zilog, itx, tx);
+
+		bps += partnbps;
+		ASSERT3U(nbps, >=, partnbps);
+		nbps -= partnbps;
+		off += partlen;
+		ASSERT3U(len, >=, partlen);
+		len -= partlen;
+	}
+}
+
 ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW,
 	"Largest data block to write to zil");
diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c
index a5dc5c399..9b351eefc 100644
--- a/module/zfs/zfs_quota.c
+++ b/module/zfs/zfs_quota.c
@@ -20,8 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <[email protected]>.
- * All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek
  * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index 32be27a8b..04dfda56b 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 Cyril Plisko. All rights reserved.
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/types.h>
@@ -1162,6 +1163,34 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
 	return (error);
 }
 
+static int
+zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+	zfsvfs_t *zfsvfs = arg1;
+	lr_clone_range_t *lr = arg2;
+	znode_t *zp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * Clones can be logged out of order, so don't be surprised if
+		 * the file is gone - just return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length,
+	    lr->lr_blksz, lr->lr_bps, lr->lr_nbps);
+
+	zrele(zp);
+	return (error);
+}
+
 /*
  * Callback vectors for replaying records
  */
@@ -1190,4 +1219,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_setsaxattr,	/* TX_SETSAXATTR */
 	zfs_replay_rename_exchange,	/* TX_RENAME_EXCHANGE */
 	zfs_replay_rename_whiteout,	/* TX_RENAME_WHITEOUT */
+	zfs_replay_clone_range,	/* TX_CLONE_RANGE */
 };
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 10677d8d9..db80be783 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -24,6 +24,7 @@
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -50,6 +51,7 @@
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/policy.h>
+#include <sys/zfeature.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_quota.h>
 #include <sys/zfs_vfsops.h>
@@ -501,7 +503,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 	}
 
-	if (zn_rlimit_fsize(zp, uio)) {
+	if (zn_rlimit_fsize_uio(zp, uio)) {
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EFBIG));
@@ -995,6 +997,467 @@ zfs_get_done(zgd_t *zgd, int error)
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
+static int
+zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+	int error;
+
+	/* Swap. Not sure if the order of zfs_enter()s is important. */
+	if (zfsvfs1 > zfsvfs2) {
+		zfsvfs_t *tmpzfsvfs;
+
+		tmpzfsvfs = zfsvfs2;
+		zfsvfs2 = zfsvfs1;
+		zfsvfs1 = tmpzfsvfs;
+	}
+
+	error = zfs_enter(zfsvfs1, tag);
+	if (error != 0)
+		return (error);
+	if (zfsvfs1 != zfsvfs2) {
+		error = zfs_enter(zfsvfs2, tag);
+		if (error != 0) {
+			zfs_exit(zfsvfs1, tag);
+			return (error);
+		}
+	}
+
+	return (0);
+}
+
+static void
+zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+
+	zfs_exit(zfsvfs1, tag);
+	if (zfsvfs1 != zfsvfs2)
+		zfs_exit(zfsvfs2, tag);
+}
+
+/*
+ * Clone a range of blocks from inzp (at *inoffp) into outzp (at *outoffp) by
+ * adding references to the existing blocks instead of copying the data.
+ *
+ * We split each clone request in chunks that can fit into a single ZIL
+ * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
+ * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
+ * us room for storing 1022 block pointers.
+ *
+ * On success, the function returns the number of bytes cloned in *lenp and
+ * advances *inoffp and *outoffp by that amount. Note, it doesn't return how
+ * many bytes are left to be cloned. If at least one chunk was cloned, any
+ * later error is dropped and 0 is returned. Several conditions that prevent
+ * cloning return EXDEV, so that the caller can fall back to copying the data.
+ */
+int
+zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
+    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
+{
+	zfsvfs_t	*inzfsvfs, *outzfsvfs;
+	objset_t	*inos, *outos;
+	zfs_locked_range_t *inlr, *outlr;
+	dmu_buf_impl_t	*db;
+	dmu_tx_t	*tx;
+	zilog_t		*zilog;
+	uint64_t	inoff, outoff, len, done;
+	uint64_t	outsize, size;
+	int		error;
+	int		count = 0;
+	sa_bulk_attr_t	bulk[3];
+	uint64_t	mtime[2], ctime[2];
+	uint64_t	uid, gid, projid;
+	blkptr_t	*bps;
+	size_t		maxblocks, nbps;
+	uint_t		inblksz;
+	uint64_t	clear_setid_bits_txg = 0;
+
+	inoff = *inoffp;
+	outoff = *outoffp;
+	len = *lenp;
+	done = 0;
+
+	inzfsvfs = ZTOZSB(inzp);
+	outzfsvfs = ZTOZSB(outzp);
+
+	/*
+	 * We need to call zfs_enter() potentially on two different datasets,
+	 * so we need a dedicated function for that.
+	 */
+	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
+	if (error != 0)
+		return (error);
+
+	/* Only look at the objsets once both datasets have been entered. */
+	inos = inzfsvfs->z_os;
+	outos = outzfsvfs->z_os;
+
+	/*
+	 * Both source and destination have to belong to the same storage pool.
+	 */
+	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	ASSERT(!outzfsvfs->z_replay);
+
+	error = zfs_verify_zp(inzp);
+	if (error == 0)
+		error = zfs_verify_zp(outzp);
+	if (error != 0) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (error);
+	}
+
+	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+	    SPA_FEATURE_BLOCK_CLONING)) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/*
+	 * We don't copy source file's flags that's why we don't allow to clone
+	 * files that are in quarantine.
+	 */
+	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EACCES));
+	}
+
+	if (inoff >= inzp->z_size) {
+		*lenp = 0;
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (0);
+	}
+	if (len > inzp->z_size - inoff) {
+		len = inzp->z_size - inoff;
+	}
+	if (len == 0) {
+		*lenp = 0;
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (0);
+	}
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(outzfsvfs)) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * If immutable or not appending then return EPERM.
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common()
+	 */
+	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* No overlapping if we are cloning within the same file. */
+	if (inzp == outzp) {
+		if (inoff < outoff + len && outoff < inoff + len) {
+			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
+	}
+
+	/* Maintain predictable lock order. */
+	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
+		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+		    RL_READER);
+		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+		    RL_WRITER);
+	} else {
+		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+		    RL_WRITER);
+		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+		    RL_READER);
+	}
+
+	inblksz = inzp->z_blksz;
+
+	/* We cannot clone into files with different block size. */
+	if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
+		error = SET_ERROR(EXDEV);
+		goto unlock;
+	}
+
+	/* Offsets and len must be at block boundaries. */
+	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
+		error = SET_ERROR(EXDEV);
+		goto unlock;
+	}
+	/*
+	 * Length must be multiple of blksz, except for the end of the file.
+	 */
+	if ((len % inblksz) != 0 &&
+	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
+		error = SET_ERROR(EXDEV);
+		goto unlock;
+	}
+
+	error = zn_rlimit_fsize(outoff + len);
+	if (error != 0) {
+		goto unlock;
+	}
+
+	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
+		error = SET_ERROR(EFBIG);
+		goto unlock;
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
+	    &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
+	    &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
+	    &outzp->z_size, 8);
+
+	zilog = outzfsvfs->z_log;
+	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
+	    sizeof (bps[0]);
+
+	uid = KUID_TO_SUID(ZTOUID(outzp));
+	gid = KGID_TO_SGID(ZTOGID(outzp));
+	projid = outzp->z_projid;
+
+	bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
+
+	/*
+	 * Clone the file in reasonable size chunks.  Each chunk is cloned
+	 * in a separate transaction; this keeps the intent log records small
+	 * and allows us to do more fine-grained space accounting.
+	 */
+	while (len > 0) {
+		size = MIN(inblksz * maxblocks, len);
+
+		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
+		    uid) ||
+		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
+		    gid) ||
+		    (projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
+		    projid))) {
+			error = SET_ERROR(EDQUOT);
+			break;
+		}
+
+		/*
+		 * Start a transaction.
+		 */
+		tx = dmu_tx_create(outos);
+
+		nbps = maxblocks;
+		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps,
+		    &nbps);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+			/*
+			 * If we are trying to clone a block that was created
+			 * in the current transaction group, return an error,
+			 * so the caller can fall back to just copying the data.
+			 */
+			if (error == EAGAIN) {
+				error = SET_ERROR(EXDEV);
+			}
+			break;
+		}
+		/*
+		 * Encrypted data is fine as long as it comes from the same
+		 * dataset.
+		 * TODO: We want to extend it in the future to allow cloning to
+		 * datasets with the same keys, like clones or to be able to
+		 * clone a file from a snapshot of an encrypted dataset into the
+		 * dataset itself.
+		 */
+		if (BP_IS_PROTECTED(&bps[0])) {
+			if (inzfsvfs != outzfsvfs) {
+				dmu_tx_abort(tx);
+				error = SET_ERROR(EXDEV);
+				break;
+			}
+		}
+
+		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
+		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
+		DB_DNODE_ENTER(db);
+		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
+		DB_DNODE_EXIT(db);
+		zfs_sa_upgrade_txholds(tx, outzp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+			break;
+		}
+
+		/*
+		 * Copy source znode's block size. This only happens on the
+		 * first iteration since zfs_rangelock_reduce() will shrink down
+		 * lr_len to the appropriate size.
+		 */
+		if (outlr->lr_length == UINT64_MAX) {
+			zfs_grow_blocksize(outzp, inblksz, tx);
+			/*
+			 * Round range lock up to the block boundary, so we
+			 * prevent appends until we are done.
+			 */
+			zfs_rangelock_reduce(outlr, outoff,
+			    ((len - 1) / inblksz + 1) * inblksz);
+		}
+
+		dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps,
+		    B_FALSE);
+
+		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
+		    &clear_setid_bits_txg, tx);
+
+		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
+
+		/*
+		 * Update the file size (zp_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((outsize = outzp->z_size) < outoff + size) {
+			(void) atomic_cas_64(&outzp->z_size, outsize,
+			    outoff + size);
+		}
+
+		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
+
+		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
+		    size, inblksz, bps, nbps);
+
+		dmu_tx_commit(tx);
+
+		if (error != 0)
+			break;
+
+		inoff += size;
+		outoff += size;
+		len -= size;
+		done += size;
+	}
+
+	kmem_free(bps, sizeof (bps[0]) * maxblocks);
+	zfs_znode_update_vfs(outzp);
+
+unlock:
+	zfs_rangelock_exit(outlr);
+	zfs_rangelock_exit(inlr);
+
+	if (done > 0) {
+		/*
+		 * If we have made at least partial progress, reset the error.
+		 */
+		error = 0;
+
+		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
+
+		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
+			zil_commit(zilog, outzp->z_id);
+		}
+
+		*inoffp += done;
+		*outoffp += done;
+		*lenp = done;
+	}
+
+	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+
+	return (error);
+}
+
+/*
+ * The usual pattern would be to call zfs_clone_range() from
+ * zfs_replay_clone_range(), but we cannot do that, because when replaying we
+ * don't have the source znode available. Hence the dedicated replay function.
+ */
+int
+zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
+    const blkptr_t *bps, size_t nbps)
+{
+	zfsvfs_t	*zfsvfs;
+	dmu_buf_impl_t	*db;
+	dmu_tx_t	*tx;
+	int		error;
+	int		count = 0;
+	sa_bulk_attr_t	bulk[3];
+	uint64_t	mtime[2], ctime[2];
+
+	ASSERT3U(off, <, MAXOFFSET_T);
+	ASSERT3U(len, >, 0);
+	ASSERT3U(nbps, >, 0);
+
+	zfsvfs = ZTOZSB(zp);
+
+	ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+	    SPA_FEATURE_BLOCK_CLONING));
+
+	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+		return (error);
+
+	ASSERT(zfsvfs->z_replay);
+	ASSERT(!zfs_is_readonly(zfsvfs));
+
+	if ((off % blksz) != 0) {
+		zfs_exit(zfsvfs, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+
+	/*
+	 * Start a transaction.
+	 */
+	tx = dmu_tx_create(zfsvfs->z_os);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+	DB_DNODE_ENTER(db);
+	dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
+	DB_DNODE_EXIT(db);
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error != 0) {
+		dmu_tx_abort(tx);
+		zfs_exit(zfsvfs, FTAG);
+		return (error);
+	}
+
+	if (zp->z_blksz < blksz)
+		zfs_grow_blocksize(zp, blksz, tx);
+
+	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+	if (zp->z_size < off + len)
+		zp->z_size = off + len;
+
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+	/*
+	 * zil_replaying() not only checks if we are replaying ZIL, but also
+	 * updates the ZIL header to record replay progress.
+	 */
+	VERIFY(zil_replaying(zfsvfs->z_log, tx));
+
+	dmu_tx_commit(tx);
+
+	zfs_znode_update_vfs(zp);
+
+	zfs_exit(zfsvfs, FTAG);
+
+	return (error);
+}
+
 EXPORT_SYMBOL(zfs_access);
 EXPORT_SYMBOL(zfs_fsync);
 EXPORT_SYMBOL(zfs_holey);
@@ -1002,6 +1465,8 @@ EXPORT_SYMBOL(zfs_read);
 EXPORT_SYMBOL(zfs_write);
 EXPORT_SYMBOL(zfs_getsecattr);
 EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_clone_range);
+EXPORT_SYMBOL(zfs_clone_range_replay);
 
 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
 	"Bytes to read per chunk");
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index fcf4e7357..fba1c1999 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -43,6 +43,7 @@
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
@@ -578,14 +579,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 }
 
 static int
-zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
-    uint64_t first_txg)
+zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	int error;
 
-	if (lrc->lrc_txtype != TX_WRITE)
-		return (0);
+	ASSERT(lrc->lrc_txtype == TX_WRITE);
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
@@ -605,6 +604,57 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
 }
 
 static int
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+	const blkptr_t *bp;
+	spa_t *spa;
+	uint_t ii;
+
+	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+	if (tx == NULL) {
+		return (0);
+	}
+
+	/*
+	 * XXX: Do we need to byteswap lr? It is read below without swapping.
+	 */
+
+	spa = zilog->zl_spa;
+
+	for (ii = 0; ii < lr->lr_nbps; ii++) {
+		bp = &lr->lr_bps[ii];
+
+		/*
+		 * When data is embedded into the BP there is no data block,
+		 * so no BRT entry is needed. Holes are skipped as well; every
+		 * other BP is passed to brt_pending_add().
+		 */
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+			brt_pending_add(spa, bp, tx);
+		}
+	}
+
+	return (0);
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t first_txg)
+{
+
+	switch (lrc->lrc_txtype) {
+	case TX_WRITE:
+		return (zil_claim_write(zilog, lrc, tx, first_txg));
+	case TX_CLONE_RANGE:
+		return (zil_claim_clone_range(zilog, lrc, tx));
+	default:
+		return (0);
+	}
+}
+
+static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
 {
@@ -616,24 +666,71 @@ zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 }
 
 static int
-zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
-    uint64_t claim_txg)
+zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
+	ASSERT(lrc->lrc_txtype == TX_WRITE);
+
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
-	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
-	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
-	    !BP_IS_HOLE(bp))
+	if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+	    !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+	}
 
 	return (0);
 }
 
 static int
+zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+	const blkptr_t *bp;
+	spa_t *spa;
+	uint_t ii;
+
+	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+	if (tx == NULL) {
+		return (0);
+	}
+
+	spa = zilog->zl_spa;
+
+	for (ii = 0; ii < lr->lr_nbps; ii++) {
+		bp = &lr->lr_bps[ii];
+
+		if (!BP_IS_HOLE(bp)) {
+			zio_free(spa, dmu_tx_get_txg(tx), bp);
+		}
+	}
+
+	return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t claim_txg)
+{
+
+	if (claim_txg == 0) {
+		return (0);
+	}
+
+	switch (lrc->lrc_txtype) {
+	case TX_WRITE:
+		return (zil_free_write(zilog, lrc, tx, claim_txg));
+	case TX_CLONE_RANGE:
+		return (zil_free_clone_range(zilog, lrc, tx));
+	default:
+		return (0);
+	}
+}
+
+static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
@@ -1798,13 +1895,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 }
 
 /*
- * Maximum amount of write data that can be put into single log block.
+ * Maximum amount of data that can be put into a single log block.
  */
 uint64_t
-zil_max_log_data(zilog_t *zilog)
+zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
-	return (zilog->zl_max_block_size -
-	    sizeof (zil_chain_t) - sizeof (lr_write_t));
+	return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
 }
 
 /*
@@ -1814,7 +1910,7 @@ zil_max_log_data(zilog_t *zilog)
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
-	return (zil_max_log_data(zilog) / 8);
+	return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
 }
 
 /*
@@ -1887,7 +1983,7 @@ cont:
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
-	max_log_data = zil_max_log_data(zilog);
+	max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d17ee60dc..1b1a1831f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -41,6 +41,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
@@ -1176,12 +1177,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
 }
 
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
+    boolean_t brtwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+	ASSERT(!brtwrite || !nopwrite);
 
 	/*
 	 * We must reset the io_prop to match the values that existed
@@ -1190,6 +1193,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 	 */
 	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 	zio->io_prop.zp_nopwrite = nopwrite;
+	zio->io_prop.zp_brtwrite = brtwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
@@ -1222,7 +1226,8 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 	    BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
-	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
+	    brt_maybe_exists(spa, bp)) {
 		metaslab_check_free(spa, bp);
 		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 	} else {
@@ -1249,11 +1254,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
-	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+	if (BP_IS_GANG(bp) ||
+	    BP_GET_DEDUP(bp) ||
+	    brt_maybe_exists(spa, bp)) {
 		/*
-		 * GANG and DEDUP blocks can induce a read (for the gang block
-		 * header, or the DDT), so issue them asynchronously so that
-		 * this thread is not tied up.
+		 * GANG, DEDUP and BRT blocks can induce a read (for the gang
+		 * block header, the DDT or the BRT), so issue them
+		 * asynchronously so that this thread is not tied up.
 		 */
 		enum zio_stage stage =
 		    ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
@@ -1594,11 +1601,15 @@ zio_write_bp_init(zio_t *zio)
 		zio_prop_t *zp = &zio->io_prop;
 
 		ASSERT(bp->blk_birth != zio->io_txg);
-		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (zp->zp_brtwrite)
+			return (zio);
+
+		ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
+
 		if (BP_IS_EMBEDDED(bp))
 			return (zio);
 
@@ -3044,6 +3055,35 @@ zio_nop_write(zio_t *zio)
 
 /*
  * ==========================================================================
+ * Block Reference Table
+ * ==========================================================================
+ */
+static zio_t *
+zio_brt_free(zio_t *zio)
+{
+	blkptr_t *bp;
+
+	bp = zio->io_bp;
+
+	if (BP_GET_LEVEL(bp) > 0 ||
+	    BP_IS_METADATA(bp) ||
+	    !brt_maybe_exists(zio->io_spa, bp)) {
+		return (zio);
+	}
+
+	if (!brt_entry_decref(zio->io_spa, bp)) {
+		/*
+		 * Not the last reference: keep the data and cut the
+		 * pipeline down to ZIO_INTERLOCK_PIPELINE.
+		 */
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+	}
+
+	return (zio);
+}
+
+/*
+ * ==========================================================================
  * Dedup
  * ==========================================================================
  */
@@ -4894,6 +4934,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
+	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 1511f763f..06bc75c63 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -482,6 +482,60 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 	return (error);
 }
 
+/*
+ * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
+ * after a system failure.
+ *
+ * TODO: For now we drop block cloning transactions for ZVOLs as they are
+ *       unsupported, but we still need to inform BRT about that as we
+ *       claimed them during pool import.
+ *       This situation can occur when we try to import a pool from a ZFS
+ *       version supporting block cloning for ZVOLs into a system running
+ *       this ZFS version, which doesn't support block cloning for ZVOLs.
+ */
+static int
+zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	zvol_state_t *zv = arg1;
+	objset_t *os = zv->zv_objset;
+	lr_clone_range_t *lr = arg2;
+	blkptr_t *bp;
+	dmu_tx_t *tx;
+	spa_t *spa;
+	uint_t ii;
+	int error;
+
+	dmu_objset_name(os, name);
+	cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
+	    name);
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	tx = dmu_tx_create(os);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	spa = os->os_spa;
+
+	for (ii = 0; ii < lr->lr_nbps; ii++) {
+		bp = &lr->lr_bps[ii];
+
+		if (!BP_IS_HOLE(bp)) {
+			zio_free(spa, dmu_tx_get_txg(tx), bp);
+		}
+	}
+
+	(void) zil_replaying(zv->zv_zilog, tx);
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
 static int
 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 {
@@ -516,6 +570,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* TX_SETSAXATTR */
 	zvol_replay_err,	/* TX_RENAME_EXCHANGE */
 	zvol_replay_err,	/* TX_RENAME_WHITEOUT */
+	zvol_replay_clone_range	/* TX_CLONE_RANGE */
 };
 
 /*