Diffstat (limited to 'module/zfs')
-rw-r--r--	module/zfs/brt.c	1884
-rw-r--r--	module/zfs/dbuf.c	126
-rw-r--r--	module/zfs/ddt.c	55
-rw-r--r--	module/zfs/dmu.c	153
-rw-r--r--	module/zfs/dmu_tx.c	47
-rw-r--r--	module/zfs/dsl_scan.c	8
-rw-r--r--	module/zfs/spa.c	39
-rw-r--r--	module/zfs/spa_misc.c	5
-rw-r--r--	module/zfs/zfs_ioctl.c	2
-rw-r--r--	module/zfs/zfs_log.c	52
-rw-r--r--	module/zfs/zfs_quota.c	3
-rw-r--r--	module/zfs/zfs_replay.c	30
-rw-r--r--	module/zfs/zfs_vnops.c	467
-rw-r--r--	module/zfs/zil.c	126
-rw-r--r--	module/zfs/zio.c	55
-rw-r--r--	module/zfs/zvol.c	55
16 files changed, 3028 insertions, 79 deletions
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
new file mode 100644
index 000000000..ca9c4e678
--- /dev/null
+++ b/module/zfs/brt.c
@@ -0,0 +1,1884 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/brt.h>
+#include <sys/ddt.h>
+#include <sys/bitmap.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/kstat.h>
+#include <sys/wmsum.h>
+
+/*
+ * Block Cloning design.
+ *
+ * Block Cloning allows a file (or a subset of its blocks) to be manually
+ * cloned into another (or the same) file by creating additional references
+ * to the data blocks, without copying the data itself. Those references
+ * are kept in the Block Reference Tables (BRTs).
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use
+ *   dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ * just once. Block Cloning creates an entry in its tables only when there
+ * are at least two references to the given data block. If the block was
+ *   never explicitly cloned or the second-to-last reference was dropped,
+ * there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ *   write(2) syscall, so the hash can be calculated. Block Cloning doesn't require
+ * data, just block pointers to the data, so it is extremely fast, as we pay
+ * neither the cost of reading the data, nor the cost of writing the data -
+ * we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ * the block is not in the dedup table (DDT) and we won't consult the DDT
+ * when we need to free the block. Block Cloning must be consulted on every
+ *   free, because we cannot modify the source BP (e.g. by setting something
+ * similar to the D bit), thus we have no hint if the block is in the
+ * Block Reference Table (BRT), so we need to look into the BRT. There is
+ * an optimization in place that allows us to eliminate the majority of BRT
+ * lookups which is described below in the "Minimizing free penalty" section.
+ * - The BRT entry is much smaller than the DDT entry - for BRT we store only
+ *   a 64bit offset and a 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ * other on disk are most likely in totally different parts of the DDT.
+ * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ * from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ * times in the DDT. Unfortunately it is not currently (if at all) possible
+ *   with Block Cloning, and a block referenced multiple times will be scrubbed
+ * multiple times. The new, sorted scrub should be able to eliminate
+ * duplicated reads given enough memory.
+ * - Deduplication requires a cryptographically strong hash as a checksum or
+ * additional data verification. Block Cloning works with any checksum
+ * algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only offset
+ * and counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in-sync. We can
+ * keep the array in memory, because it is very small. With 16MB regions and
+ * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then will
+ * we do a BRT lookup, and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
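+ *
+ * A rough sketch of that fast path on free (simplified; the authoritative
+ * version is brt_vdev_lookup() below):
+ *
+ *     idx = bre->bre_offset / brt->brt_rangesize;
+ *     if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size)
+ *             return (brt_vdev_entcount_get(brtvd, idx) > 0);
+ *     return (FALSE);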
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite the
+ * entire array on every change. We therefore divide the array into 32kB
+ * blocks and keep
+ * a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can only update the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we will update entire array if there was
+ * any change.
+ *
+ * The implementation tries to be economical: if the BRT is not used, or no
+ * longer used, there will be no entries in the MOS and no additional memory
+ * used (e.g.
+ * the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both DDT and BRT. When we free one of the
+ * references we couldn't tell where it belongs, so we would have to decide
+ * what table takes the precedence: do we first clear DDT references or BRT
+ * references? To avoid this dilemma BRT cooperates with DDT - if a given block
+ * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
+ * look up the DDT entry instead and increase the counter there. No BRT entry
+ * will be created for a block which has the D (dedup) bit set.
+ * BRT may be more efficient for manual deduplication, but if the block is
+ * already in the DDT, then creating an additional BRT entry would be less
+ * efficient. This clever idea was proposed by Allan Jude.
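+ *
+ * In code the dispatch is a simple check (a simplified excerpt of
+ * brt_pending_apply() below):
+ *
+ *     if (BP_GET_DEDUP(&bpe->bpe_bp))
+ *             added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+ *     else
+ *             added_to_ddt = B_FALSE;
+ *     if (!added_to_ddt)
+ *             brt_entry_addref(brt, &bpe->bpe_bp);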
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into
+ * the dataset we need no additional storage. Without Block Cloning we would
+ * need additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Note: currently it is not possible to clone blocks between encrypted
+ * datasets, even if those datasets use the same encryption key (this includes
+ * snapshots of encrypted datasets). Cloning blocks between datasets that use
+ * the same keys should be possible and should be implemented in the future.
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we will
+ * use it for block cloning.
+ *
+ * ssize_t
+ * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ * size_t len, unsigned int flags);
+ *
+ * Even though the offsets and length represent bytes, they have to be
+ * block-aligned or we will return the EXDEV error, so the upper layer can
+ * fall back to the generic mechanism that will just copy the data.
+ * The copy_file_range(2) call ends up in the OS-independent zfs_clone_range()
+ * function.
+ * This function was implemented based on zfs_write(), but instead of writing
+ * the given data we first read block pointers using the new dmu_read_l0_bps()
+ * function from the source file. Once we have BPs from the source file we call
+ * the dmu_brt_clone() function on the destination file. This function
+ * allocates BPs for us. We iterate over all source BPs. If the given BP is
+ * a hole or an embedded block, we just copy the BP as-is. If it points to
+ * real data, we place this BP on the BRT pending list using the
+ * brt_pending_add()
+ * function.
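+ *
+ * In pseudocode (argument lists elided, as the exact DMU signatures are
+ * outside the scope of this file):
+ *
+ *     dmu_read_l0_bps(...);                   read BPs from the source file
+ *     for each source bp:
+ *             if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ *                     copy the BP as-is
+ *             else
+ *                     brt_pending_add(spa, bp, tx);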
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ *   transaction group that we are trying to clone. Such a block has no BP
+ *   allocated yet, so it cannot be cloned immediately. We return EXDEV.
+ * - The block we want to clone may have been modified within the same
+ * transaction group. We return EXDEV.
+ * - A block may be cloned multiple times during one transaction group (that's
+ *   why the pending list is actually a tree and not an append-only list -
+ *   this way we can quickly tell whether this block is cloned for the first
+ *   time in this txg or a subsequent time).
+ * - A block may be cloned and freed within the same transaction group
+ * (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ * can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ * open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call brt_entry_decref(),
+ * which loads the corresponding BRT entry (if one exists) and decreases the
+ * reference counter. If this is not the last reference, we will stop the ZIO
+ * pipeline here. If this is the last reference, or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
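+ *
+ * Roughly (a sketch of the decision only, not the exact zio.c code):
+ *
+ *     if (!brt_maybe_exists(spa, bp) || brt_entry_decref(spa, bp))
+ *             free the block as usual
+ *     else
+ *             references remain - stop the ZIO pipeline here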
+ *
+ * At the beginning of spa_sync(), when there can be no more block cloning
+ * but frees have not yet been issued, we call brt_pending_apply(). This
+ * function applies all the new clones to the BRT table - we load BRT entries
+ * and update reference counters. To sync new BRT entries to disk, we use the
+ * brt_sync()
+ * function. This function will sync all dirty per-top-level-vdev BRTs,
+ * the entry counters arrays, etc.
+ *
+ * Block Cloning and ZIL.
+ *
+ * Every clone operation is divided into chunks (similar to write) and each
+ * chunk is cloned in a separate transaction. The chunk size is determined by
+ * how many BPs we can fit into a single ZIL entry.
+ * Replaying a clone operation is different from a regular clone operation,
+ * as when we log clone operations we cannot use the source object - it may
+ * reside on a different dataset - so we log the BPs we want to clone.
+ * The ZIL is replayed when we mount the given dataset, not when the pool is
+ * imported. Taking this into account it is possible that the pool is imported
+ * without mounting datasets and the source dataset is destroyed before the
+ * destination dataset is mounted and its ZIL replayed.
+ * To address this situation we leverage the zil_claim() mechanism, where ZFS
+ * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
+ * entries, we will bump reference counters for their BPs in the BRT and then
+ * on mount and ZIL replay we will just attach BPs to the file without
+ * bumping reference counters.
+ * Note it is still possible that after zil_claim() we never mount the
+ * destination, so we never replay its ZIL and we destroy it. This way we would
+ * end up with leaked references in the BRT. We address that too, as ZFS gives us
+ * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
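+ *
+ * In pseudocode, for a TX_CLONE_RANGE record (a sketch, not the exact
+ * claim/replay code):
+ *
+ *     if (claiming on pool import)
+ *             bump the BRT reference counters for the logged BPs
+ *     else (replaying on mount)
+ *             attach the BPs to the file; counters were already bumped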
+ */
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define BRT_RANGESIZE (16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+ "BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. We maintain a
+ * bitmap of dirty blocks within the regions, so that a single bit represents
+ * a block's worth of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within the 128MB entcounts
+ * array is represented by a single bit. This gives us 4096 bits. A set bit
+ * in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define BRT_BLOCKSIZE (32 * 1024)
+#define BRT_RANGESIZE_TO_NBLOCKS(size) \
+ (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define BRT_LITTLE_ENDIAN 0
+#define BRT_BIG_ENDIAN 1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
+#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
+#else
+#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
+#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
+#endif
+
+typedef struct brt_vdev_phys {
+ uint64_t bvp_mos_entries;
+ uint64_t bvp_size;
+ uint64_t bvp_byteorder;
+ uint64_t bvp_totalcount;
+ uint64_t bvp_rangesize;
+ uint64_t bvp_usedspace;
+ uint64_t bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+ /*
+ * VDEV id.
+ */
+ uint64_t bv_vdevid;
+ /*
+ * Is the structure initiated?
+ * (bv_entcount and bv_bitmap are allocated?)
+ */
+ boolean_t bv_initiated;
+ /*
+ * Object number in the MOS for the entcount array and brt_vdev_phys.
+ */
+ uint64_t bv_mos_brtvdev;
+ /*
+ * Object number in the MOS for the entries table.
+ */
+ uint64_t bv_mos_entries;
+ /*
+ * Entries to sync.
+ */
+ avl_tree_t bv_tree;
+ /*
+ * Does the bv_entcount[] array need byte swapping?
+ */
+ boolean_t bv_need_byteswap;
+ /*
+ * Number of entries in the bv_entcount[] array.
+ */
+ uint64_t bv_size;
+ /*
+ * This is the array with BRT entry count per BRT_RANGESIZE.
+ */
+ uint16_t *bv_entcount;
+ /*
+ * Sum of all bv_entcount[]s.
+ */
+ uint64_t bv_totalcount;
+ /*
+ * Space on disk occupied by cloned blocks (without compression).
+ */
+ uint64_t bv_usedspace;
+ /*
+ * How much additional space would be occupied without block cloning.
+ */
+ uint64_t bv_savedspace;
+ /*
+ * brt_vdev_phys needs updating on disk.
+ */
+ boolean_t bv_meta_dirty;
+ /*
+ * bv_entcount[] needs updating on disk.
+ */
+ boolean_t bv_entcount_dirty;
+ /*
+ * bv_entcount[] can potentially be a bit too big to synchronize it all
+ * when we have changed only a few entcounts. The fields below allow us to
+ * track updates to the bv_entcount[] array since the last sync.
+ * A single bit in the bv_bitmap represents as many entcounts as can
+ * fit into a single BRT_BLOCKSIZE.
+ * For example we have 65536 entcounts in the bv_entcount array
+ * (so the whole array is 128kB). We updated bv_entcount[2] and
+ * bv_entcount[5]. In that case only the first bit in the bv_bitmap will
+ * be set and we will write only the first BRT_BLOCKSIZE out of 128kB.
+ */
+ ulong_t *bv_bitmap;
+ uint64_t bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core brt
+ */
+typedef struct brt {
+ krwlock_t brt_lock;
+ spa_t *brt_spa;
+#define brt_mos brt_spa->spa_meta_objset
+ uint64_t brt_rangesize;
+ uint64_t brt_usedspace;
+ uint64_t brt_savedspace;
+ avl_tree_t brt_pending_tree[TXG_SIZE];
+ kmutex_t brt_pending_lock[TXG_SIZE];
+ /* Sum of all entries across all bv_trees. */
+ uint64_t brt_nentries;
+ brt_vdev_t *brt_vdevs;
+ uint64_t brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define BRT_KEY_WORDS (1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+ uint64_t bre_offset;
+ uint64_t bre_refcount;
+ avl_node_t bre_node;
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+ blkptr_t bpe_bp;
+ int bpe_count;
+ avl_node_t bpe_node;
+} brt_pending_entry_t;
+
+static kmem_cache_t *brt_entry_cache;
+static kmem_cache_t *brt_pending_entry_cache;
+
+/*
+ * Enable/disable prefetching of BRT entries that we are going to modify.
+ */
+int zfs_brt_prefetch = 1;
+
+#ifdef ZFS_DEBUG
+#define BRT_DEBUG(...) do { \
+ if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
+ __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
+ } \
+} while (0)
+#else
+#define BRT_DEBUG(...) do { } while (0)
+#endif
+
+int brt_zap_leaf_blockshift = 12;
+int brt_zap_indirect_blockshift = 12;
+
+static kstat_t *brt_ksp;
+
+typedef struct brt_stats {
+ kstat_named_t brt_addref_entry_in_memory;
+ kstat_named_t brt_addref_entry_not_on_disk;
+ kstat_named_t brt_addref_entry_on_disk;
+ kstat_named_t brt_addref_entry_read_lost_race;
+ kstat_named_t brt_decref_entry_in_memory;
+ kstat_named_t brt_decref_entry_loaded_from_disk;
+ kstat_named_t brt_decref_entry_not_in_memory;
+ kstat_named_t brt_decref_entry_not_on_disk;
+ kstat_named_t brt_decref_entry_read_lost_race;
+ kstat_named_t brt_decref_entry_still_referenced;
+ kstat_named_t brt_decref_free_data_later;
+ kstat_named_t brt_decref_free_data_now;
+ kstat_named_t brt_decref_no_entry;
+} brt_stats_t;
+
+static brt_stats_t brt_stats = {
+ { "addref_entry_in_memory", KSTAT_DATA_UINT64 },
+ { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
+ { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
+ { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 },
+ { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
+ { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
+ { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
+ { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 },
+ { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
+ { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
+ { "decref_free_data_later", KSTAT_DATA_UINT64 },
+ { "decref_free_data_now", KSTAT_DATA_UINT64 },
+ { "decref_no_entry", KSTAT_DATA_UINT64 }
+};
+
+struct {
+ wmsum_t brt_addref_entry_in_memory;
+ wmsum_t brt_addref_entry_not_on_disk;
+ wmsum_t brt_addref_entry_on_disk;
+ wmsum_t brt_addref_entry_read_lost_race;
+ wmsum_t brt_decref_entry_in_memory;
+ wmsum_t brt_decref_entry_loaded_from_disk;
+ wmsum_t brt_decref_entry_not_in_memory;
+ wmsum_t brt_decref_entry_not_on_disk;
+ wmsum_t brt_decref_entry_read_lost_race;
+ wmsum_t brt_decref_entry_still_referenced;
+ wmsum_t brt_decref_free_data_later;
+ wmsum_t brt_decref_free_data_now;
+ wmsum_t brt_decref_no_entry;
+} brt_sums;
+
+#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
+
+static int brt_entry_compare(const void *x1, const void *x2);
+static int brt_pending_entry_compare(const void *x1, const void *x2);
+
+static void
+brt_rlock(brt_t *brt)
+{
+ rw_enter(&brt->brt_lock, RW_READER);
+}
+
+static void
+brt_wlock(brt_t *brt)
+{
+ rw_enter(&brt->brt_lock, RW_WRITER);
+}
+
+static void
+brt_unlock(brt_t *brt)
+{
+ rw_exit(&brt->brt_lock);
+}
+
+static uint16_t
+brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
+{
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ if (brtvd->bv_need_byteswap) {
+ return (BSWAP_16(brtvd->bv_entcount[idx]));
+ } else {
+ return (brtvd->bv_entcount[idx]);
+ }
+}
+
+static void
+brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
+{
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ if (brtvd->bv_need_byteswap) {
+ brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
+ } else {
+ brtvd->bv_entcount[idx] = entcnt;
+ }
+}
+
+static void
+brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
+{
+ uint16_t entcnt;
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ entcnt = brt_vdev_entcount_get(brtvd, idx);
+ ASSERT(entcnt < UINT16_MAX);
+
+ brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
+}
+
+static void
+brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
+{
+ uint16_t entcnt;
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ entcnt = brt_vdev_entcount_get(brtvd, idx);
+ ASSERT(entcnt > 0);
+
+ brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
+}
+
+#ifdef ZFS_DEBUG
+static void
+brt_vdev_dump(brt_t *brt)
+{
+ brt_vdev_t *brtvd;
+ uint64_t vdevid;
+
+ if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
+ return;
+ }
+
+ if (brt->brt_nvdevs == 0) {
+ zfs_dbgmsg("BRT empty");
+ return;
+ }
+
+ zfs_dbgmsg("BRT vdev dump:");
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ uint64_t idx;
+
+ brtvd = &brt->brt_vdevs[vdevid];
+ zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
+ "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
+ (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
+ brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
+ (u_longlong_t)brtvd->bv_size,
+ (u_longlong_t)brtvd->bv_totalcount,
+ (u_longlong_t)brtvd->bv_nblocks,
+ (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
+ if (brtvd->bv_totalcount > 0) {
+ zfs_dbgmsg(" entcounts:");
+ for (idx = 0; idx < brtvd->bv_size; idx++) {
+ if (brt_vdev_entcount_get(brtvd, idx) > 0) {
+ zfs_dbgmsg(" [%04llu] %hu",
+ (u_longlong_t)idx,
+ brt_vdev_entcount_get(brtvd, idx));
+ }
+ }
+ }
+ if (brtvd->bv_entcount_dirty) {
+ char *bitmap;
+
+ bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
+ for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
+ bitmap[idx] =
+ BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
+ }
+ bitmap[idx] = '\0';
+ zfs_dbgmsg(" bitmap: %s", bitmap);
+ kmem_free(bitmap, brtvd->bv_nblocks + 1);
+ }
+ }
+}
+#endif
+
+static brt_vdev_t *
+brt_vdev(brt_t *brt, uint64_t vdevid)
+{
+ brt_vdev_t *brtvd;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ if (vdevid < brt->brt_nvdevs) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ } else {
+ brtvd = NULL;
+ }
+
+ return (brtvd);
+}
+
+static void
+brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ char name[64];
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT0(brtvd->bv_mos_brtvdev);
+ ASSERT0(brtvd->bv_mos_entries);
+ ASSERT(brtvd->bv_entcount != NULL);
+ ASSERT(brtvd->bv_size > 0);
+ ASSERT(brtvd->bv_bitmap != NULL);
+ ASSERT(brtvd->bv_nblocks > 0);
+
+ brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
+ ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
+ brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
+ 0, tx);
+ VERIFY(brtvd->bv_mos_entries != 0);
+ BRT_DEBUG("MOS entries created, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_entries);
+
+ /*
+ * We allocate DMU buffer to store the bv_entcount[] array.
+ * We will keep the array size (bv_size) and the cumulative count for all
+ * bv_entcount[]s (bv_totalcount) in the bonus buffer.
+ */
+ brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
+ DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
+ VERIFY(brtvd->bv_mos_brtvdev != 0);
+ BRT_DEBUG("MOS BRT VDEV created, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_brtvdev);
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
+ BRT_DEBUG("Pool directory object created, object=%s", name);
+
+ spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+static void
+brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+ vdev_t *vd;
+ uint16_t *entcount;
+ ulong_t *bitmap;
+ uint64_t nblocks, size;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+
+ spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
+ size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
+ spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
+
+ entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
+ nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
+ bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
+
+ if (!brtvd->bv_initiated) {
+ ASSERT0(brtvd->bv_size);
+ ASSERT(brtvd->bv_entcount == NULL);
+ ASSERT(brtvd->bv_bitmap == NULL);
+ ASSERT0(brtvd->bv_nblocks);
+
+ avl_create(&brtvd->bv_tree, brt_entry_compare,
+ sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
+ } else {
+ ASSERT(brtvd->bv_size > 0);
+ ASSERT(brtvd->bv_entcount != NULL);
+ ASSERT(brtvd->bv_bitmap != NULL);
+ ASSERT(brtvd->bv_nblocks > 0);
+ /*
+ * TODO: Allow vdev shrinking. We only need to implement
+ * shrinking the on-disk BRT VDEV object.
+ * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
+ * size, tx);
+ */
+ ASSERT3U(brtvd->bv_size, <=, size);
+
+ memcpy(entcount, brtvd->bv_entcount,
+ sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
+ memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
+ BT_SIZEOFMAP(brtvd->bv_nblocks)));
+ kmem_free(brtvd->bv_entcount,
+ sizeof (entcount[0]) * brtvd->bv_size);
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ }
+
+ brtvd->bv_size = size;
+ brtvd->bv_entcount = entcount;
+ brtvd->bv_bitmap = bitmap;
+ brtvd->bv_nblocks = nblocks;
+ if (!brtvd->bv_initiated) {
+ brtvd->bv_need_byteswap = FALSE;
+ brtvd->bv_initiated = TRUE;
+ BRT_DEBUG("BRT VDEV %llu initiated.",
+ (u_longlong_t)brtvd->bv_vdevid);
+ }
+}
+
+static void
+brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
+{
+ char name[64];
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+ int error;
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
+ if (error != 0)
+ return;
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+
+ error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
+ ASSERT0(error);
+ if (error != 0)
+ return;
+
+ bvphys = db->db_data;
+ if (brt->brt_rangesize == 0) {
+ brt->brt_rangesize = bvphys->bvp_rangesize;
+ } else {
+ ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
+ }
+
+ ASSERT(!brtvd->bv_initiated);
+ brt_vdev_realloc(brt, brtvd);
+
+ /* TODO: We don't support VDEV shrinking. */
+ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
+
+ /*
+ * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
+ */
+ error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
+ brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
+ ASSERT0(error);
+
+ brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
+ ASSERT(brtvd->bv_mos_entries != 0);
+ brtvd->bv_need_byteswap =
+ (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
+ brtvd->bv_totalcount = bvphys->bvp_totalcount;
+ brtvd->bv_usedspace = bvphys->bvp_usedspace;
+ brtvd->bv_savedspace = bvphys->bvp_savedspace;
+ brt->brt_usedspace += brtvd->bv_usedspace;
+ brt->brt_savedspace += brtvd->bv_savedspace;
+
+ dmu_buf_rele(db, FTAG);
+
+ BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
+ name, (u_longlong_t)brtvd->bv_mos_brtvdev,
+ (u_longlong_t)brtvd->bv_mos_entries);
+}
+
+static void
+brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_initiated);
+
+ kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
+ brtvd->bv_entcount = NULL;
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ brtvd->bv_bitmap = NULL;
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
+ avl_destroy(&brtvd->bv_tree);
+
+ brtvd->bv_size = 0;
+ brtvd->bv_nblocks = 0;
+
+ brtvd->bv_initiated = FALSE;
+ BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
+}
+
+static void
+brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ char name[64];
+ uint64_t count;
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+ ASSERT(brtvd->bv_mos_entries != 0);
+
+ VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
+ VERIFY0(count);
+ VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
+ BRT_DEBUG("MOS entries destroyed, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_entries);
+ brtvd->bv_mos_entries = 0;
+
+ VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+ bvphys = db->db_data;
+ ASSERT0(bvphys->bvp_totalcount);
+ ASSERT0(bvphys->bvp_usedspace);
+ ASSERT0(bvphys->bvp_savedspace);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
+ BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_brtvdev);
+ brtvd->bv_mos_brtvdev = 0;
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ BRT_DEBUG("Pool directory object removed, object=%s", name);
+
+ brt_vdev_dealloc(brt, brtvd);
+
+ spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+static void
+brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
+{
+ brt_vdev_t *brtvd, *vdevs;
+ uint64_t vdevid;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT3U(nvdevs, >, brt->brt_nvdevs);
+
+ vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
+ if (brt->brt_nvdevs > 0) {
+ ASSERT(brt->brt_vdevs != NULL);
+
+ memcpy(vdevs, brt->brt_vdevs,
+ sizeof (brt_vdev_t) * brt->brt_nvdevs);
+ kmem_free(brt->brt_vdevs,
+ sizeof (brt_vdev_t) * brt->brt_nvdevs);
+ }
+ for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
+ brtvd = &vdevs[vdevid];
+
+ brtvd->bv_vdevid = vdevid;
+ brtvd->bv_initiated = FALSE;
+ }
+
+ BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
+ (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
+
+ brt->brt_vdevs = vdevs;
+ brt->brt_nvdevs = nvdevs;
+}
+
+static boolean_t
+brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
+{
+ uint64_t idx;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
+ /* VDEV wasn't expanded. */
+ return (brt_vdev_entcount_get(brtvd, idx) > 0);
+ }
+
+ return (FALSE);
+}
+
+static void
+brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+ uint64_t dsize)
+{
+ uint64_t idx;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(brtvd != NULL);
+ ASSERT(brtvd->bv_entcount != NULL);
+
+ brt->brt_savedspace += dsize;
+ brtvd->bv_savedspace += dsize;
+ brtvd->bv_meta_dirty = TRUE;
+
+ if (bre->bre_refcount > 1) {
+ return;
+ }
+
+ brt->brt_usedspace += dsize;
+ brtvd->bv_usedspace += dsize;
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ if (idx >= brtvd->bv_size) {
+ /* VDEV has been expanded. */
+ brt_vdev_realloc(brt, brtvd);
+ }
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ brtvd->bv_totalcount++;
+ brt_vdev_entcount_inc(brtvd, idx);
+ brtvd->bv_entcount_dirty = TRUE;
+ idx = idx / BRT_BLOCKSIZE / 8;
+ BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+ brt_vdev_dump(brt);
+#endif
+}
+
+static void
+brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+ uint64_t dsize)
+{
+ uint64_t idx;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd != NULL);
+ ASSERT(brtvd->bv_entcount != NULL);
+
+ brt->brt_savedspace -= dsize;
+ brtvd->bv_savedspace -= dsize;
+ brtvd->bv_meta_dirty = TRUE;
+
+ if (bre->bre_refcount > 0) {
+ return;
+ }
+
+ brt->brt_usedspace -= dsize;
+ brtvd->bv_usedspace -= dsize;
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ ASSERT(brtvd->bv_totalcount > 0);
+ brtvd->bv_totalcount--;
+ brt_vdev_entcount_dec(brtvd, idx);
+ brtvd->bv_entcount_dirty = TRUE;
+ idx = idx / BRT_BLOCKSIZE / 8;
+ BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+ brt_vdev_dump(brt);
+#endif
+}
+
+static void
+brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+
+ ASSERT(brtvd->bv_meta_dirty);
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+
+ if (brtvd->bv_entcount_dirty) {
+ /*
+ * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
+ */
+ dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
+ brtvd->bv_entcount, tx);
+ memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ brtvd->bv_entcount_dirty = FALSE;
+ }
+
+ dmu_buf_will_dirty(db, tx);
+ bvphys = db->db_data;
+ bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
+ bvphys->bvp_size = brtvd->bv_size;
+ if (brtvd->bv_need_byteswap) {
+ bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
+ } else {
+ bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
+ }
+ bvphys->bvp_totalcount = brtvd->bv_totalcount;
+ bvphys->bvp_rangesize = brt->brt_rangesize;
+ bvphys->bvp_usedspace = brtvd->bv_usedspace;
+ bvphys->bvp_savedspace = brtvd->bv_savedspace;
+ dmu_buf_rele(db, FTAG);
+
+ brtvd->bv_meta_dirty = FALSE;
+}
+
+static void
+brt_vdevs_alloc(brt_t *brt, boolean_t load)
+{
+ brt_vdev_t *brtvd;
+ uint64_t vdevid;
+
+ brt_wlock(brt);
+
+ brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
+
+ if (load) {
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ ASSERT(brtvd->bv_entcount == NULL);
+
+ brt_vdev_load(brt, brtvd);
+ }
+ }
+
+ if (brt->brt_rangesize == 0) {
+ brt->brt_rangesize = BRT_RANGESIZE;
+ }
+
+ brt_unlock(brt);
+}
+
+static void
+brt_vdevs_free(brt_t *brt)
+{
+ brt_vdev_t *brtvd;
+ uint64_t vdevid;
+
+ brt_wlock(brt);
+
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ if (brtvd->bv_initiated)
+ brt_vdev_dealloc(brt, brtvd);
+ }
+ kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
+
+ brt_unlock(brt);
+}
+
+static void
+brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
+{
+
+ bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
+ bre->bre_refcount = 0;
+
+ *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
+}
+
+static int
+brt_entry_compare(const void *x1, const void *x2)
+{
+ const brt_entry_t *bre1 = x1;
+ const brt_entry_t *bre2 = x2;
+
+ return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
+}
+
+static int
+brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
+{
+ uint64_t mos_entries;
+ uint64_t one, physsize;
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ if (!brt_vdev_lookup(brt, brtvd, bre))
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * Remember the mos_entries object number. After we reacquire the BRT lock,
+ * the brtvd pointer may be invalid.
+ */
+ mos_entries = brtvd->bv_mos_entries;
+ if (mos_entries == 0)
+ return (SET_ERROR(ENOENT));
+
+ brt_unlock(brt);
+
+ error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
+ BRT_KEY_WORDS, &one, &physsize);
+ if (error == 0) {
+ ASSERT3U(one, ==, 1);
+ ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
+
+ error = zap_lookup_uint64(brt->brt_mos, mos_entries,
+ &bre->bre_offset, BRT_KEY_WORDS, 1,
+ sizeof (bre->bre_refcount), &bre->bre_refcount);
+ BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
+ "count=%llu error=%d", (u_longlong_t)mos_entries,
+ (u_longlong_t)brtvd->bv_vdevid,
+ (u_longlong_t)bre->bre_offset,
+ error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
+ }
+
+ brt_wlock(brt);
+
+ return (error);
+}
+
+static void
+brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
+{
+ brt_vdev_t *brtvd;
+ uint64_t mos_entries = 0;
+
+ brt_rlock(brt);
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd != NULL)
+ mos_entries = brtvd->bv_mos_entries;
+ brt_unlock(brt);
+
+ if (mos_entries == 0)
+ return;
+
+ BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
+ (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
+ (u_longlong_t)bre->bre_offset);
+ (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
+ (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
+}
+
+static int
+brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_mos_entries != 0);
+ ASSERT(bre->bre_refcount > 0);
+
+ error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
+ (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
+ sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
+ BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
+ "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+ (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+ (u_longlong_t)bre->bre_refcount, error);
+
+ return (error);
+}
+
+static int
+brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_mos_entries != 0);
+ ASSERT0(bre->bre_refcount);
+
+ error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
+ (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
+ BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
+ "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+ (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+ (u_longlong_t)bre->bre_refcount, error);
+
+ return (error);
+}
+
+/*
+ * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
+ * positive, but it gives us a quick answer whether we should look into the
+ * BRT, which may require reads and thus will be more expensive.
+ */
+boolean_t
+brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_vdev_t *brtvd;
+ brt_entry_t bre_search;
+ boolean_t mayexists = FALSE;
+ uint64_t vdevid;
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_rlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd != NULL && brtvd->bv_initiated) {
+ if (!avl_is_empty(&brtvd->bv_tree) ||
+ brt_vdev_lookup(brt, brtvd, &bre_search)) {
+ mayexists = TRUE;
+ }
+ }
+
+ brt_unlock(brt);
+
+ return (mayexists);
+}
+
+uint64_t
+brt_get_dspace(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_savedspace);
+}
+
+uint64_t
+brt_get_used(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_usedspace);
+}
+
+uint64_t
+brt_get_saved(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_savedspace);
+}
+
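+/*
+ * The ratio is expressed in percent. For example (an illustration, not part
+ * of the original code): with 1GB of cloned data on disk (brt_usedspace) and
+ * 3GB of space saved by cloning (brt_savedspace), the result is
+ * (1GB + 3GB) * 100 / 1GB = 400, i.e. a 4.00x ratio.
+ */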
+uint64_t
+brt_get_ratio(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt->brt_usedspace == 0)
+ return (100);
+
+ return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
+ brt->brt_usedspace);
+}
+
+static int
+brt_kstats_update(kstat_t *ksp, int rw)
+{
+ brt_stats_t *bs = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ bs->brt_addref_entry_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_in_memory);
+ bs->brt_addref_entry_not_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
+ bs->brt_addref_entry_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_on_disk);
+ bs->brt_addref_entry_read_lost_race.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
+ bs->brt_decref_entry_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_in_memory);
+ bs->brt_decref_entry_loaded_from_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
+ bs->brt_decref_entry_not_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
+ bs->brt_decref_entry_not_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
+ bs->brt_decref_entry_read_lost_race.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
+ bs->brt_decref_entry_still_referenced.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
+ bs->brt_decref_free_data_later.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_free_data_later);
+ bs->brt_decref_free_data_now.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_free_data_now);
+ bs->brt_decref_no_entry.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_no_entry);
+
+ return (0);
+}
+
+static void
+brt_stat_init(void)
+{
+
+ wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
+ wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
+ wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
+ wmsum_init(&brt_sums.brt_decref_no_entry, 0);
+
+ brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (brt_ksp != NULL) {
+ brt_ksp->ks_data = &brt_stats;
+ brt_ksp->ks_update = brt_kstats_update;
+ kstat_install(brt_ksp);
+ }
+}
+
+static void
+brt_stat_fini(void)
+{
+ if (brt_ksp != NULL) {
+ kstat_delete(brt_ksp);
+ brt_ksp = NULL;
+ }
+
+ wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
+ wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
+ wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
+ wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
+ wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
+ wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
+ wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
+ wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
+ wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
+ wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
+ wmsum_fini(&brt_sums.brt_decref_free_data_later);
+ wmsum_fini(&brt_sums.brt_decref_free_data_now);
+ wmsum_fini(&brt_sums.brt_decref_no_entry);
+}
+
+void
+brt_init(void)
+{
+ brt_entry_cache = kmem_cache_create("brt_entry_cache",
+ sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
+ sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ brt_stat_init();
+}
+
+void
+brt_fini(void)
+{
+ brt_stat_fini();
+
+ kmem_cache_destroy(brt_entry_cache);
+ kmem_cache_destroy(brt_pending_entry_cache);
+}
+
+static brt_entry_t *
+brt_entry_alloc(const brt_entry_t *bre_init)
+{
+ brt_entry_t *bre;
+
+ bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
+ bre->bre_offset = bre_init->bre_offset;
+ bre->bre_refcount = bre_init->bre_refcount;
+
+ return (bre);
+}
+
+static void
+brt_entry_free(brt_entry_t *bre)
+{
+
+ kmem_cache_free(brt_entry_cache, bre);
+}
+
+static void
+brt_entry_addref(brt_t *brt, const blkptr_t *bp)
+{
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre, *racebre;
+ brt_entry_t bre_search;
+ avl_index_t where;
+ uint64_t vdevid;
+ int error;
+
+ ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_wlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd == NULL) {
+ ASSERT3U(vdevid, >=, brt->brt_nvdevs);
+
+ /* New VDEV was added. */
+ brt_vdevs_expand(brt, vdevid + 1);
+ brtvd = brt_vdev(brt, vdevid);
+ }
+ ASSERT(brtvd != NULL);
+ if (!brtvd->bv_initiated)
+ brt_vdev_realloc(brt, brtvd);
+
+ bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+ if (bre != NULL) {
+ BRTSTAT_BUMP(brt_addref_entry_in_memory);
+ } else {
+ /*
+ * brt_entry_lookup() may drop the BRT (read) lock and
+ * reacquire it (write).
+ */
+ error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* bre_search now contains correct bre_refcount */
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == 0)
+ BRTSTAT_BUMP(brt_addref_entry_on_disk);
+ else
+ BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
+ /*
+ * When the BRT lock was dropped, brt_vdevs[] may have been
+ * expanded and reallocated, so we need to update brtvd's pointer.
+ */
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+ if (racebre == NULL) {
+ bre = brt_entry_alloc(&bre_search);
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ avl_insert(&brtvd->bv_tree, bre, where);
+ brt->brt_nentries++;
+ } else {
+ /*
+ * The entry was added when the BRT lock was dropped in
+ * brt_entry_lookup().
+ */
+ BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
+ bre = racebre;
+ }
+ }
+ bre->bre_refcount++;
+ brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+ brt_unlock(brt);
+}
+
+/* Return TRUE if block should be freed immediately. */
+boolean_t
+brt_entry_decref(spa_t *spa, const blkptr_t *bp)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre, *racebre;
+ brt_entry_t bre_search;
+ avl_index_t where;
+ uint64_t vdevid;
+ int error;
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_wlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+ if (bre != NULL) {
+ BRTSTAT_BUMP(brt_decref_entry_in_memory);
+ goto out;
+ } else {
+ BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
+ }
+
+ /*
+ * brt_entry_lookup() may drop the BRT lock and reacquire it.
+ */
+ error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* bre_search now contains correct bre_refcount */
+ ASSERT(error == 0 || error == ENOENT);
+ /*
+ * When the BRT lock was dropped, brt_vdevs[] may have been expanded
+ * and reallocated, so we need to update brtvd's pointer.
+ */
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ if (error == ENOENT) {
+ BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
+ bre = NULL;
+ goto out;
+ }
+
+ racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+ if (racebre != NULL) {
+ /*
+ * The entry was added when the BRT lock was dropped in
+ * brt_entry_lookup().
+ */
+ BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
+ bre = racebre;
+ goto out;
+ }
+
+ BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
+ bre = brt_entry_alloc(&bre_search);
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ avl_insert(&brtvd->bv_tree, bre, where);
+ brt->brt_nentries++;
+
+out:
+ if (bre == NULL) {
+ /*
+ * This is a free of a regular (not cloned) block.
+ */
+ brt_unlock(brt);
+ BRTSTAT_BUMP(brt_decref_no_entry);
+ return (B_TRUE);
+ }
+ if (bre->bre_refcount == 0) {
+ brt_unlock(brt);
+ BRTSTAT_BUMP(brt_decref_free_data_now);
+ return (B_TRUE);
+ }
+
+ ASSERT(bre->bre_refcount > 0);
+ bre->bre_refcount--;
+ if (bre->bre_refcount == 0)
+ BRTSTAT_BUMP(brt_decref_free_data_later);
+ else
+ BRTSTAT_BUMP(brt_decref_entry_still_referenced);
+ brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+ brt_unlock(brt);
+
+ return (B_FALSE);
+}
+
+static void
+brt_prefetch(brt_t *brt, const blkptr_t *bp)
+{
+ brt_entry_t bre;
+ uint64_t vdevid;
+
+ ASSERT(bp != NULL);
+
+ if (!zfs_brt_prefetch)
+ return;
+
+ brt_entry_fill(bp, &bre, &vdevid);
+
+ brt_entry_prefetch(brt, vdevid, &bre);
+}
+
+static int
+brt_pending_entry_compare(const void *x1, const void *x2)
+{
+ const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
+ const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
+ int cmp;
+
+ cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
+ if (cmp == 0) {
+ cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
+ DVA_GET_VDEV(&bp2->blk_dva[0]));
+ if (cmp == 0) {
+ cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
+ DVA_GET_OFFSET(&bp2->blk_dva[0]));
+ }
+ }
+
+ return (cmp);
+}
+
+void
+brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ brt_t *brt;
+ avl_tree_t *pending_tree;
+ kmutex_t *pending_lock;
+ brt_pending_entry_t *bpe, *newbpe;
+ avl_index_t where;
+ uint64_t txg;
+
+ brt = spa->spa_brt;
+ txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, !=, 0);
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
+ newbpe->bpe_bp = *bp;
+ newbpe->bpe_count = 1;
+
+ mutex_enter(pending_lock);
+
+ bpe = avl_find(pending_tree, newbpe, &where);
+ if (bpe == NULL) {
+ avl_insert(pending_tree, newbpe, where);
+ newbpe = NULL;
+ } else {
+ bpe->bpe_count++;
+ }
+
+ mutex_exit(pending_lock);
+
+ if (newbpe != NULL) {
+ ASSERT(bpe != NULL);
+ ASSERT(bpe != newbpe);
+ kmem_cache_free(brt_pending_entry_cache, newbpe);
+ } else {
+ ASSERT(bpe == NULL);
+ }
+
+ /* Prefetch BRT entry, as we will need it in the syncing context. */
+ brt_prefetch(brt, bp);
+}
+
+void
+brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ brt_t *brt;
+ avl_tree_t *pending_tree;
+ kmutex_t *pending_lock;
+ brt_pending_entry_t *bpe, bpe_search;
+ uint64_t txg;
+
+ brt = spa->spa_brt;
+ txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, !=, 0);
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ bpe_search.bpe_bp = *bp;
+
+ mutex_enter(pending_lock);
+
+ bpe = avl_find(pending_tree, &bpe_search, NULL);
+ /* I believe we should always find bpe when this function is called. */
+ if (bpe != NULL) {
+ ASSERT(bpe->bpe_count > 0);
+
+ bpe->bpe_count--;
+ if (bpe->bpe_count == 0) {
+ avl_remove(pending_tree, bpe);
+ kmem_cache_free(brt_pending_entry_cache, bpe);
+ }
+ }
+
+ mutex_exit(pending_lock);
+}
+
+void
+brt_pending_apply(spa_t *spa, uint64_t txg)
+{
+ brt_t *brt;
+ brt_pending_entry_t *bpe;
+ avl_tree_t *pending_tree;
+ kmutex_t *pending_lock;
+ void *c;
+
+ ASSERT3U(txg, !=, 0);
+
+ brt = spa->spa_brt;
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ mutex_enter(pending_lock);
+
+ c = NULL;
+ while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
+ boolean_t added_to_ddt;
+
+ mutex_exit(pending_lock);
+
+ for (int i = 0; i < bpe->bpe_count; i++) {
+ /*
+ * If the block has the DEDUP bit set, it means that it
+ * already exists in the DEDUP table, so we can just
+ * use that instead of creating a new entry in
+ * the BRT table.
+ */
+ if (BP_GET_DEDUP(&bpe->bpe_bp)) {
+ added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+ } else {
+ added_to_ddt = B_FALSE;
+ }
+ if (!added_to_ddt)
+ brt_entry_addref(brt, &bpe->bpe_bp);
+ }
+
+ kmem_cache_free(brt_pending_entry_cache, bpe);
+ mutex_enter(pending_lock);
+ }
+
+ mutex_exit(pending_lock);
+}
+
+static void
+brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_mos_entries != 0);
+
+ if (bre->bre_refcount == 0) {
+ int error;
+
+ error = brt_entry_remove(brt, brtvd, bre, tx);
+ ASSERT(error == 0 || error == ENOENT);
+ /*
+ * If error == ENOENT then zfs_clone_range() was done from a
+ * removed (but opened) file (open(), unlink()).
+ */
+ ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
+ } else {
+ VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
+ }
+}
+
+static void
+brt_sync_table(brt_t *brt, dmu_tx_t *tx)
+{
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre;
+ uint64_t vdevid;
+ void *c;
+
+ brt_wlock(brt);
+
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+
+ if (!brtvd->bv_initiated)
+ continue;
+
+ if (!brtvd->bv_meta_dirty) {
+ ASSERT(!brtvd->bv_entcount_dirty);
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
+ continue;
+ }
+
+ ASSERT(!brtvd->bv_entcount_dirty ||
+ avl_numnodes(&brtvd->bv_tree) != 0);
+
+ if (brtvd->bv_mos_brtvdev == 0)
+ brt_vdev_create(brt, brtvd, tx);
+
+ c = NULL;
+ while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
+ brt_sync_entry(brt, brtvd, bre, tx);
+ brt_entry_free(bre);
+ ASSERT(brt->brt_nentries > 0);
+ brt->brt_nentries--;
+ }
+
+ brt_vdev_sync(brt, brtvd, tx);
+
+ if (brtvd->bv_totalcount == 0)
+ brt_vdev_destroy(brt, brtvd, tx);
+ }
+
+ ASSERT0(brt->brt_nentries);
+
+ brt_unlock(brt);
+}
+
+void
+brt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ brt_t *brt;
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ brt = spa->spa_brt;
+ brt_rlock(brt);
+ if (brt->brt_nentries == 0) {
+ /* No changes. */
+ brt_unlock(brt);
+ return;
+ }
+ brt_unlock(brt);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ brt_sync_table(brt, tx);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+brt_table_alloc(brt_t *brt)
+{
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ avl_create(&brt->brt_pending_tree[i],
+ brt_pending_entry_compare,
+ sizeof (brt_pending_entry_t),
+ offsetof(brt_pending_entry_t, bpe_node));
+ mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
+ NULL);
+ }
+}
+
+static void
+brt_table_free(brt_t *brt)
+{
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
+
+ avl_destroy(&brt->brt_pending_tree[i]);
+ mutex_destroy(&brt->brt_pending_lock[i]);
+ }
+}
+
+static void
+brt_alloc(spa_t *spa)
+{
+ brt_t *brt;
+
+ ASSERT(spa->spa_brt == NULL);
+
+ brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
+ rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
+ brt->brt_spa = spa;
+ brt->brt_rangesize = 0;
+ brt->brt_nentries = 0;
+ brt->brt_vdevs = NULL;
+ brt->brt_nvdevs = 0;
+ brt_table_alloc(brt);
+
+ spa->spa_brt = brt;
+}
+
+void
+brt_create(spa_t *spa)
+{
+
+ brt_alloc(spa);
+ brt_vdevs_alloc(spa->spa_brt, B_FALSE);
+}
+
+int
+brt_load(spa_t *spa)
+{
+
+ brt_alloc(spa);
+ brt_vdevs_alloc(spa->spa_brt, B_TRUE);
+
+ return (0);
+}
+
+void
+brt_unload(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return;
+
+ brt_vdevs_free(brt);
+ brt_table_free(brt);
+ rw_destroy(&brt->brt_lock);
+ kmem_free(brt, sizeof (*brt));
+ spa->spa_brt = NULL;
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
+ "Enable prefetching of BRT entries");
+#ifdef ZFS_BRT_DEBUG
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
+#endif
+/* END CSTYLED */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 191e5e043..94c2ae9d7 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/zfs_context.h>
@@ -49,6 +50,7 @@
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/vdev.h>
#include <cityhash.h>
#include <sys/spa_impl.h>
@@ -1427,7 +1429,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
}
static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
{
blkptr_t *bps = db->db.db_data;
uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1436,12 +1438,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
for (int i = 0; i < n_bps; i++) {
blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
- BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+ BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+ BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
}
}
@@ -1451,30 +1453,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
* was taken, ENOENT if no action was taken.
*/
static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ int is_hole = bp == NULL || BP_IS_HOLE(bp);
/*
* For level 0 blocks only, if the above check fails:
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*/
- if (!is_hole && db->db_level == 0) {
- is_hole = dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr);
- }
+ if (!is_hole && db->db_level == 0)
+ is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
if (is_hole) {
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
memset(db->db.db_data, 0, db->db.db_size);
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- dbuf_handle_indirect_hole(db, dn);
+ if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+ bp->blk_birth != 0) {
+ dbuf_handle_indirect_hole(db, dn, bp);
}
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1551,12 +1550,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
+ blkptr_t bp, *bpp;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_buf == NULL);
ASSERT(db->db_parent == NULL ||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1566,16 +1566,46 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- err = dbuf_read_hole(db, dn);
+ if (db->db_state == DB_UNCACHED) {
+ if (db->db_blkptr == NULL) {
+ bpp = NULL;
+ } else {
+ bp = *db->db_blkptr;
+ bpp = &bp;
+ }
+ } else {
+ struct dirty_leaf *dl;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+ dr = list_head(&db->db_dirty_records);
+ if (dr == NULL) {
+ err = EIO;
+ goto early_unlock;
+ } else {
+ dl = &dr->dt.dl;
+ if (!dl->dr_brtwrite) {
+ err = EIO;
+ goto early_unlock;
+ }
+ bp = dl->dr_overridden_by;
+ bpp = &bp;
+ }
+ }
+
+ err = dbuf_read_hole(db, dn, bpp);
if (err == 0)
goto early_unlock;
+ ASSERT(bpp != NULL);
+
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(db->db_blkptr)) {
+ if (BP_IS_REDACTED(bpp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
@@ -1590,7 +1620,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
spa_log_error(db->db_objset->os_spa, &zb);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(db->db_objset));
@@ -1621,15 +1651,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
/*
- * The zio layer will copy the provided blkptr later, but we need to
- * do this now so that we can release the parent's rwlock. We have to
- * do that now so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we have our
+ * own copy so that we can release the parent's rwlock. We have to
+ * do that so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
- blkptr_t bp = *db->db_blkptr;
dmu_buf_unlock_parent(db, dblt, tag);
- (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ (void) arc_read(zio, db->db_objset->os_spa, bpp,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
@@ -1731,9 +1760,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -1780,13 +1806,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
+ } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
boolean_t need_wait = B_FALSE;
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ if (zio == NULL && (db->db_state == DB_NOFILL ||
+ (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
spa_t *spa = dn->dn_objset->os_spa;
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
@@ -1913,7 +1939,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
* the buf thawed to save the effort of freezing &
* immediately re-thawing it.
*/
- arc_release(dr->dt.dl.dr_data, db);
+ if (!dr->dt.dl.dr_brtwrite)
+ arc_release(dr->dt.dl.dr_data, db);
}
/*
@@ -1996,6 +2023,11 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
db->db_blkid > dn->dn_maxblkid)
dn->dn_maxblkid = db->db_blkid;
dbuf_unoverride(dr);
+ if (dr->dt.dl.dr_brtwrite) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
} else {
/*
* This dbuf is not dirty in the open context.
@@ -2285,7 +2317,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DMU_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dmu_objset_willuse_space(os, db->db.db_size, tx);
}
@@ -2328,8 +2360,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
- if (db->db_blkid != DMU_BONUS_BLKID)
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dr->dr_accounted = db->db.db_size;
+ }
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2489,6 +2522,7 @@ static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
+ boolean_t brtwrite;
ASSERT(txg != 0);
@@ -2513,6 +2547,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
ASSERT(dr->dr_dbuf == db);
+ brtwrite = dr->dt.dl.dr_brtwrite;
+ if (brtwrite) {
+ /*
+ * We are freeing a block that we cloned in the same
+ * transaction group.
+ */
+ brt_pending_remove(dmu_objset_spa(db->db_objset),
+ &dr->dt.dl.dr_overridden_by, tx);
+ }
+
dnode_t *dn = dr->dr_dnode;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2542,7 +2586,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
}
- if (db->db_state != DB_NOFILL) {
+ if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
ASSERT(db->db_buf != NULL);
@@ -2557,7 +2601,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
@@ -4748,8 +4793,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != NULL &&
+ dr->dt.dl.dr_data != db->db_buf) {
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
} else {
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -5046,7 +5093,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+ dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 7880a899a..33fea0ba3 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
*/
#include <sys/zfs_context.h>
@@ -1180,5 +1181,59 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
return (SET_ERROR(ENOENT));
}
+/*
+ * This function is used by Block Cloning (brt.c) to increase the reference
+ * counter for a DDT entry if the block is already in the DDT.
+ *
+ * Returns B_FALSE if the block, despite having the D bit set, is not present
+ * in the DDT. This is currently not possible, but might be in the future.
+ * See the comment below.
+ */
+boolean_t
+ddt_addref(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+ boolean_t result;
+
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ ddt = ddt_select(spa, bp);
+ ddt_enter(ddt);
+
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ASSERT(dde != NULL);
+
+ if (dde->dde_type < DDT_TYPES) {
+ ddt_phys_t *ddp;
+
+ ASSERT3S(dde->dde_class, <, DDT_CLASSES);
+
+ ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
+ if (ddp->ddp_refcnt == 0) {
+ /* This should never happen? */
+ ddt_phys_fill(ddp, bp);
+ }
+ ddt_phys_addref(ddp);
+ result = B_TRUE;
+ } else {
+ /*
+		 * At the time of implementing this, if the block has the
+		 * DEDUP flag set it must exist in the DEDUP table, but
+		 * there are many advocates for the ability to remove
+		 * entries from the DDT with refcnt=1. If that ever happens,
+		 * we may end up with a block that has the DEDUP flag set, but
+		 * no corresponding entry in the DDT. Be ready.
+ */
+ ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
+ ddt_remove(ddt, dde);
+ result = B_FALSE;
+ }
+
+ ddt_exit(ddt);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+
+ return (result);
+}
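+/*
+ * A sketch of the expected caller (an assumption based on the comment
+ * above, not code from this patch): when cloning a block that already has
+ * the D (dedup) bit set, brt.c would bump the DDT reference count via
+ * ddt_addref() and fall back to creating a BRT entry only if it returns
+ * B_FALSE.
+ */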
+
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
"Enable prefetching dedup-ed blks");
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 9b8fc7e49..e6bade11c 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -29,6 +29,7 @@
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/dmu.h>
@@ -52,6 +53,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/trace_zfs.h>
#include <sys/zfs_racct.h>
#include <sys/zfs_rlock.h>
@@ -513,7 +515,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio_t *zio = NULL;
boolean_t missed = B_FALSE;
- ASSERT(length <= DMU_MAX_ACCESS);
+ ASSERT(!read || length <= DMU_MAX_ACCESS);
/*
* Note: We directly notify the prefetch code of this read, so that
@@ -2165,6 +2167,155 @@ restart:
return (err);
}
+int
+dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+ dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp)
+{
+ dmu_buf_t **dbp, *dbuf;
+ dmu_buf_impl_t *db;
+ blkptr_t *bp;
+ int error, numbufs;
+
+ error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+ &numbufs, &dbp);
+ if (error != 0) {
+ if (error == ESRCH) {
+ error = SET_ERROR(ENXIO);
+ }
+ return (error);
+ }
+
+ ASSERT3U(numbufs, <=, *nbpsp);
+
+ for (int i = 0; i < numbufs; i++) {
+ dbuf = dbp[i];
+ db = (dmu_buf_impl_t *)dbuf;
+ bp = db->db_blkptr;
+
+ /*
+		 * If the block is not on disk yet, it has no BP assigned.
+		 * There is not much we can do...
+ */
+ if (!list_is_empty(&db->db_dirty_records)) {
+ dbuf_dirty_record_t *dr;
+
+ dr = list_head(&db->db_dirty_records);
+ if (dr->dt.dl.dr_brtwrite) {
+ /*
+				 * This is a very special case where we clone
+				 * a block and, in the same transaction group,
+				 * read its BP (most likely to clone the clone).
+ */
+ bp = &dr->dt.dl.dr_overridden_by;
+ } else {
+ /*
+ * The block was modified in the same
+ * transaction group.
+ */
+ error = SET_ERROR(EAGAIN);
+ goto out;
+ }
+ }
+ if (bp == NULL) {
+ /*
+ * The block was created in this transaction group,
+ * so it has no BP yet.
+ */
+ error = SET_ERROR(EAGAIN);
+ goto out;
+ }
+ if (dmu_buf_is_dirty(dbuf, tx)) {
+ error = SET_ERROR(EAGAIN);
+ goto out;
+ }
+ /*
+ * Make sure we clone only data blocks.
+ */
+ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ bps[i] = *bp;
+ }
+
+ *nbpsp = numbufs;
+out:
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (error);
+}
+
+void
+dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+ dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
+{
+ spa_t *spa;
+ dmu_buf_t **dbp, *dbuf;
+ dmu_buf_impl_t *db;
+ struct dirty_leaf *dl;
+ dbuf_dirty_record_t *dr;
+ const blkptr_t *bp;
+ int numbufs;
+
+ spa = os->os_spa;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+ &numbufs, &dbp));
+ ASSERT3U(nbps, ==, numbufs);
+
+ for (int i = 0; i < numbufs; i++) {
+ dbuf = dbp[i];
+ db = (dmu_buf_impl_t *)dbuf;
+ bp = &bps[i];
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
+
+ if (db->db_state == DB_UNCACHED) {
+ /*
+ * XXX-PJD: If the dbuf is already cached, calling
+ * dmu_buf_will_not_fill() will panic on assertion
+ * (db->db_buf == NULL) in dbuf_clear_data(),
+ * which is called from dbuf_noread() in DB_NOFILL
+ * case. I'm not 100% sure this is the right thing
+ * to do, but it seems to work.
+ */
+ dmu_buf_will_not_fill(dbuf, tx);
+ }
+
+ dr = list_head(&db->db_dirty_records);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ dl->dr_overridden_by = *bp;
+ dl->dr_brtwrite = B_TRUE;
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ if (BP_IS_HOLE(bp)) {
+ dl->dr_overridden_by.blk_birth = 0;
+ dl->dr_overridden_by.blk_phys_birth = 0;
+ } else {
+ dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ dl->dr_overridden_by.blk_phys_birth =
+ BP_PHYSICAL_BIRTH(bp);
+ }
+
+ /*
+		 * When the data is embedded in the BP there is no need to
+		 * create a BRT entry, as there is no data block. Just copy
+		 * the BP as it contains the data.
+		 * Also, when replaying the ZIL we don't want to bump the
+		 * references in the BRT, as that was already done during
+		 * ZIL claim.
+ */
+ if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+ brt_pending_add(spa, bp, tx);
+ }
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
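+/*
+ * Illustrative two-step usage of dmu_read_l0_bps() and dmu_brt_clone(),
+ * roughly mirroring what zfs_clone_range() later in this patch does for
+ * each chunk (a sketch; "inobj"/"outobj" are placeholders and error
+ * handling is omitted):
+ *
+ *	nbps = maxblocks;
+ *	error = dmu_read_l0_bps(inos, inobj, inoff, size, tx, bps, &nbps);
+ *	if (error == 0)
+ *		dmu_brt_clone(outos, outobj, outoff, size, tx, bps, nbps,
+ *		    B_FALSE);
+ *
+ * EAGAIN from dmu_read_l0_bps() means the source block is dirty in the
+ * current txg and cannot be cloned yet.
+ */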
+
void
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 815e27a6c..1c5608c45 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -349,7 +349,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx)
}
static void
-dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
dmu_tx_t *tx = txh->txh_tx;
dnode_t *dn = txh->txh_dnode;
@@ -357,15 +357,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
ASSERT(tx->tx_txg == 0);
- dmu_tx_count_dnode(txh);
-
if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
- dmu_tx_count_dnode(txh);
-
/*
* For i/o error checking, we read the first and last level-0
* blocks if they are not aligned, and all the level-1 blocks.
@@ -445,8 +441,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+ }
}
void
@@ -455,8 +453,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
dmu_tx_hold_t *txh;
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+ }
+}
+
+static void
+dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+
+ /*
+	 * Reuse dmu_tx_count_free(); it does exactly what we need for clone.
+ */
+ dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_clone(txh, off, len);
+ }
}
static void
@@ -667,6 +692,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
case THT_NEWOBJECT:
match_object = TRUE;
break;
+ case THT_CLONE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ break;
default:
cmn_err(CE_PANIC, "bad txh_type %d",
txh->txh_type);
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 021cba68c..8e3fd126c 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -47,6 +47,7 @@
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
@@ -3499,11 +3500,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
scn->scn_dedup_frees_this_txg = 0;
/*
- * Write out changes to the DDT that may be required as a
- * result of the blocks freed. This ensures that the DDT
- * is clean when a scrub/resilver runs.
+ * Write out changes to the DDT and the BRT that may be required
+ * as a result of the blocks freed. This ensures that the DDT
+ * and the BRT are clean when a scrub/resilver runs.
*/
ddt_sync(spa, tx->tx_txg);
+ brt_sync(spa, tx->tx_txg);
}
if (err != 0)
return (err);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 6be6fe115..98a302237 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -52,6 +52,7 @@
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
@@ -341,6 +342,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
ddt_get_pool_dedup_ratio(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
+ brt_get_used(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
+ brt_get_saved(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
+ brt_get_ratio(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src);
@@ -1707,6 +1714,7 @@ spa_unload(spa_t *spa)
}
ddt_unload(spa);
+ brt_unload(spa);
spa_unload_log_sm_metadata(spa);
/*
@@ -4415,6 +4423,21 @@ spa_ld_load_dedup_tables(spa_t *spa)
}
static int
+spa_ld_load_brt(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = brt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "brt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
{
vdev_t *rvd = spa->spa_root_vdev;
@@ -4895,6 +4918,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
if (error != 0)
return (error);
+ error = spa_ld_load_brt(spa);
+ if (error != 0)
+ return (error);
+
/*
* Verify the logs now to make sure we don't have any unexpected errors
* when we claim log blocks later.
@@ -5963,6 +5990,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
* Create DDTs (dedup tables).
*/
ddt_create(spa);
+ /*
+ * Create BRT table and BRT table object.
+ */
+ brt_create(spa);
spa_update_dspace(spa);
@@ -9138,6 +9169,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
&spa->spa_deferred_bpobj, tx);
}
+ brt_sync(spa, txg);
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
svr_sync(spa, tx);
@@ -9263,6 +9295,13 @@ spa_sync(spa_t *spa, uint64_t txg)
ZIO_FLAG_CANFAIL);
/*
+ * Now that there can be no more cloning in this transaction group,
+ * but we are still before issuing frees, we can process pending BRT
+ * updates.
+ */
+ brt_pending_apply(spa, txg);
+
+ /*
* Lock out configuration changes.
*/
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 53763e915..8466fa80e 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -57,6 +57,7 @@
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/kstat.h>
#include "zfs_prop.h"
@@ -1834,7 +1835,7 @@ void
spa_update_dspace(spa_t *spa)
{
spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
- ddt_get_dedup_dspace(spa);
+ ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
if (spa->spa_nonallocating_dspace > 0) {
/*
* Subtract the space provided by all non-allocating vdevs that
@@ -2410,6 +2411,7 @@ spa_init(spa_mode_t mode)
unique_init();
zfs_btree_init();
metaslab_stat_init();
+ brt_init();
ddt_init();
zio_init();
dmu_init();
@@ -2446,6 +2448,7 @@ spa_fini(void)
dmu_fini();
zio_fini();
ddt_fini();
+ brt_fini();
metaslab_stat_fini();
zfs_btree_fini();
unique_fini();
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 24ae0a00d..9b859adc5 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -23,7 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska
* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Portions Copyright 2012 Pawel Jakub Dawidek <[email protected]>
+ * Copyright (c) 2012 Pawel Jakub Dawidek
* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 77bf9140d..d009c58d8 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
*/
@@ -891,5 +892,56 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
zil_itx_assign(zilog, itx, tx);
}
+/*
+ * Handles TX_CLONE_RANGE transactions.
+ */
+void
+zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
+ uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
+ size_t nbps)
+{
+ itx_t *itx;
+ lr_clone_range_t *lr;
+ uint64_t partlen, max_log_data;
+ size_t i, partnbps;
+
+ VERIFY(!zil_replaying(zilog, tx));
+
+ if (zp->z_unlinked)
+ return;
+
+ max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
+
+ while (nbps > 0) {
+ partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
+ partlen = 0;
+ for (i = 0; i < partnbps; i++) {
+ partlen += BP_GET_LSIZE(&bps[i]);
+ }
+ partlen = MIN(partlen, len);
+
+ itx = zil_itx_create(txtype,
+ sizeof (*lr) + sizeof (bps[0]) * partnbps);
+ lr = (lr_clone_range_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = partlen;
+ lr->lr_blksz = blksz;
+ lr->lr_nbps = partnbps;
+ memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+
+ zil_itx_assign(zilog, itx, tx);
+
+ bps += partnbps;
+ ASSERT3U(nbps, >=, partnbps);
+ nbps -= partnbps;
+ off += partlen;
+ ASSERT3U(len, >=, partlen);
+ len -= partlen;
+ }
+}
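+/*
+ * Worked example of the chunking above, assuming the default 128 KiB ZIL
+ * block size: zil_max_log_data(zilog, sizeof (lr_clone_range_t)) leaves
+ * 130816 bytes of payload (see the comment above zfs_clone_range()), so
+ * partnbps = MIN(nbps, 130816 / sizeof (blkptr_t)) allows at most 1022
+ * block pointers per TX_CLONE_RANGE record.
+ */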
+
ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW,
"Largest data block to write to zil");
diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c
index a5dc5c399..9b351eefc 100644
--- a/module/zfs/zfs_quota.c
+++ b/module/zfs/zfs_quota.c
@@ -20,8 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <[email protected]>.
- * All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek
* Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index 32be27a8b..04dfda56b 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 Cyril Plisko. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/types.h>
@@ -1162,6 +1163,34 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
+static int
+zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_clone_range_t *lr = arg2;
+ znode_t *zp;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * Clones can be logged out of order, so don't be surprised if
+ * the file is gone - just return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length,
+ lr->lr_blksz, lr->lr_bps, lr->lr_nbps);
+
+ zrele(zp);
+ return (error);
+}
+
/*
* Callback vectors for replaying records
*/
@@ -1190,4 +1219,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_setsaxattr, /* TX_SETSAXATTR */
zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */
zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
+ zfs_replay_clone_range, /* TX_CLONE_RANGE */
};
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 10677d8d9..db80be783 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -24,6 +24,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -50,6 +51,7 @@
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
+#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
@@ -501,7 +503,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
}
- if (zn_rlimit_fsize(zp, uio)) {
+ if (zn_rlimit_fsize_uio(zp, uio)) {
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFBIG));
@@ -995,6 +997,467 @@ zfs_get_done(zgd_t *zgd, int error)
kmem_free(zgd, sizeof (zgd_t));
}
+static int
+zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+ int error;
+
+ /* Swap. Not sure if the order of zfs_enter()s is important. */
+ if (zfsvfs1 > zfsvfs2) {
+ zfsvfs_t *tmpzfsvfs;
+
+ tmpzfsvfs = zfsvfs2;
+ zfsvfs2 = zfsvfs1;
+ zfsvfs1 = tmpzfsvfs;
+ }
+
+ error = zfs_enter(zfsvfs1, tag);
+ if (error != 0)
+ return (error);
+ if (zfsvfs1 != zfsvfs2) {
+ error = zfs_enter(zfsvfs2, tag);
+ if (error != 0) {
+ zfs_exit(zfsvfs1, tag);
+ return (error);
+ }
+ }
+
+ return (0);
+}
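+/*
+ * Note on the swap above: entering the two zfsvfs in a fixed (address)
+ * order gives all callers a single global lock order, which is the usual
+ * way to avoid lock-order reversals when two datasets are involved.
+ */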
+
+static void
+zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+
+ zfs_exit(zfsvfs1, tag);
+ if (zfsvfs1 != zfsvfs2)
+ zfs_exit(zfsvfs2, tag);
+}
+
+/*
+ * We split each clone request into chunks that can fit into a single ZIL
+ * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
+ * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
+ * us room for storing 1022 block pointers.
+ *
+ * On success, the function returns the number of bytes copied in *lenp.
+ * Note that it doesn't return how many bytes are left to be copied.
+ */
+int
+zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
+ uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
+{
+ zfsvfs_t *inzfsvfs, *outzfsvfs;
+ objset_t *inos, *outos;
+ zfs_locked_range_t *inlr, *outlr;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ zilog_t *zilog;
+ uint64_t inoff, outoff, len, done;
+ uint64_t outsize, size;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+ uint64_t uid, gid, projid;
+ blkptr_t *bps;
+ size_t maxblocks, nbps;
+ uint_t inblksz;
+ uint64_t clear_setid_bits_txg = 0;
+
+ inoff = *inoffp;
+ outoff = *outoffp;
+ len = *lenp;
+ done = 0;
+
+ inzfsvfs = ZTOZSB(inzp);
+ outzfsvfs = ZTOZSB(outzp);
+ inos = inzfsvfs->z_os;
+ outos = outzfsvfs->z_os;
+
+	/*
+	 * We need to call zfs_enter() potentially on two different datasets,
+	 * so we need a dedicated function for that.
+	 */
+	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Both source and destination have to belong to the same storage
+	 * pool. Note that this check has to follow zfs_enter_two(), as the
+	 * error path exits both datasets.
+	 */
+	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+ ASSERT(!outzfsvfs->z_replay);
+
+ error = zfs_verify_zp(inzp);
+ if (error == 0)
+ error = zfs_verify_zp(outzp);
+ if (error != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (error);
+ }
+
+ if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+ SPA_FEATURE_BLOCK_CLONING)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We don't copy source file's flags that's why we don't allow to clone
+	 * We don't copy the source file's flags, which is why we don't allow
+	 * cloning of files that are in quarantine.
+ if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (inoff >= inzp->z_size) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+ if (len > inzp->z_size - inoff) {
+ len = inzp->z_size - inoff;
+ }
+ if (len == 0) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(outzfsvfs)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+	 * If the destination file is immutable then return EPERM.
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common().
+ */
+ if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+	 * No overlapping is allowed when we are cloning within the same file.
+ */
+ if (inzp == outzp) {
+ if (inoff < outoff + len && outoff < inoff + len) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /*
+ * Maintain predictable lock order.
+ */
+ if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ } else {
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ }
+
+ inblksz = inzp->z_blksz;
+
+ /*
+	 * We cannot clone into a file with a different block size, unless
+	 * the destination file is still small enough to have its block size
+	 * grown to match (see zfs_grow_blocksize() below).
+ */
+ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
+ error = SET_ERROR(EXDEV);
+ goto unlock;
+ }
+
+ /*
+	 * Offsets and len must be at block boundaries.
+ */
+ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
+ error = SET_ERROR(EXDEV);
+ goto unlock;
+ }
+ /*
+	 * Length must be a multiple of blksz, except at the end of the file.
+ */
+ if ((len % inblksz) != 0 &&
+ (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
+ error = SET_ERROR(EXDEV);
+ goto unlock;
+ }
+
+ error = zn_rlimit_fsize(outoff + len);
+ if (error != 0) {
+ goto unlock;
+ }
+
+ if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
+ error = SET_ERROR(EFBIG);
+ goto unlock;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
+ &outzp->z_size, 8);
+
+ zilog = outzfsvfs->z_log;
+ maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
+ sizeof (bps[0]);
+
+ uid = KUID_TO_SUID(ZTOUID(outzp));
+ gid = KGID_TO_SGID(ZTOGID(outzp));
+ projid = outzp->z_projid;
+
+ bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
+
+ /*
+ * Clone the file in reasonable size chunks. Each chunk is cloned
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (len > 0) {
+ size = MIN(inblksz * maxblocks, len);
+
+ if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
+ uid) ||
+ zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
+ gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(outos);
+
+ nbps = maxblocks;
+ error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps,
+ &nbps);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ /*
+			 * If we are trying to clone a block that was created
+			 * in the current transaction group, return an error
+			 * so the caller can fall back to just copying the
+			 * data.
+ */
+ if (error == EAGAIN) {
+ error = SET_ERROR(EXDEV);
+ }
+ break;
+ }
+ /*
+ * Encrypted data is fine as long as it comes from the same
+ * dataset.
+		 * TODO: We want to extend this in the future to allow cloning
+		 * to datasets with the same keys (e.g. clones), and to allow
+		 * cloning a file from a snapshot of an encrypted dataset into
+		 * the dataset itself.
+ */
+ if (BP_IS_PROTECTED(&bps[0])) {
+ if (inzfsvfs != outzfsvfs) {
+ dmu_tx_abort(tx);
+ error = SET_ERROR(EXDEV);
+ break;
+ }
+ }
+
+ dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, outzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * Copy source znode's block size. This only happens on the
+ * first iteration since zfs_rangelock_reduce() will shrink down
+ * lr_len to the appropriate size.
+ */
+ if (outlr->lr_length == UINT64_MAX) {
+ zfs_grow_blocksize(outzp, inblksz, tx);
+ /*
+ * Round range lock up to the block boundary, so we
+ * prevent appends until we are done.
+ */
+ zfs_rangelock_reduce(outlr, outoff,
+ ((len - 1) / inblksz + 1) * inblksz);
+ }
+
+ dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps,
+ B_FALSE);
+
+ zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
+ &clear_setid_bits_txg, tx);
+
+ zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((outsize = outzp->z_size) < outoff + size) {
+ (void) atomic_cas_64(&outzp->z_size, outsize,
+ outoff + size);
+ }
+
+ error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
+ size, inblksz, bps, nbps);
+
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+
+ inoff += size;
+ outoff += size;
+ len -= size;
+ done += size;
+ }
+
+ kmem_free(bps, sizeof (bps[0]) * maxblocks);
+ zfs_znode_update_vfs(outzp);
+
+unlock:
+ zfs_rangelock_exit(outlr);
+ zfs_rangelock_exit(inlr);
+
+ if (done > 0) {
+ /*
+ * If we have made at least partial progress, reset the error.
+ */
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
+
+ if (outos->os_sync == ZFS_SYNC_ALWAYS) {
+ zil_commit(zilog, outzp->z_id);
+ }
+
+ *inoffp += done;
+ *outoffp += done;
+ *lenp = done;
+ }
+
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+
+ return (error);
+}
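+/*
+ * Hypothetical caller loop (an illustration, not part of this change):
+ * a copy_file_range()-style handler can keep calling zfs_clone_range()
+ * until the requested range is done, since the offsets are advanced by
+ * the number of bytes actually cloned:
+ *
+ *	while (len > 0) {
+ *		size = len;
+ *		error = zfs_clone_range(inzp, &inoff, outzp, &outoff,
+ *		    &size, cr);
+ *		if (error != 0 || size == 0)
+ *			break;
+ *		len -= size;
+ *	}
+ *
+ * A real caller must treat size == 0 (e.g. reading at EOF) as a stop
+ * condition to avoid spinning.
+ */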
+
+/*
+ * The usual pattern would be to call zfs_clone_range() from
+ * zfs_replay_clone(), but we cannot do that, because when replaying we
+ * don't have the source znode available. This is why we need a dedicated
+ * replay function.
+ */
+int
+zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
+ const blkptr_t *bps, size_t nbps)
+{
+ zfsvfs_t *zfsvfs;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+
+ ASSERT3U(off, <, MAXOFFSET_T);
+ ASSERT3U(len, >, 0);
+ ASSERT3U(nbps, >, 0);
+
+ zfsvfs = ZTOZSB(zp);
+
+ ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+ SPA_FEATURE_BLOCK_CLONING));
+
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+
+ ASSERT(zfsvfs->z_replay);
+ ASSERT(!zfs_is_readonly(zfsvfs));
+
+ if ((off % blksz) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
+
+ if (zp->z_blksz < blksz)
+ zfs_grow_blocksize(zp, blksz, tx);
+
+ dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ if (zp->z_size < off + len)
+ zp->z_size = off + len;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ /*
+	 * zil_replaying() not only checks if we are replaying the ZIL, but
+	 * also updates the ZIL header to record replay progress.
+ */
+ VERIFY(zil_replaying(zfsvfs->z_log, tx));
+
+ dmu_tx_commit(tx);
+
+ zfs_znode_update_vfs(zp);
+
+ zfs_exit(zfsvfs, FTAG);
+
+ return (error);
+}
+
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
@@ -1002,6 +1465,8 @@ EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_clone_range);
+EXPORT_SYMBOL(zfs_clone_range_replay);
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
"Bytes to read per chunk");
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index fcf4e7357..fba1c1999 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -43,6 +43,7 @@
#include <sys/metaslab.h>
#include <sys/trace_zfs.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/wmsum.h>
/*
@@ -578,14 +579,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
}
static int
-zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
- uint64_t first_txg)
+zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
int error;
- if (lrc->lrc_txtype != TX_WRITE)
- return (0);
+ ASSERT(lrc->lrc_txtype == TX_WRITE);
/*
* If the block is not readable, don't claim it. This can happen
@@ -605,6 +604,57 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
}
static int
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+ const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+ const blkptr_t *bp;
+ spa_t *spa;
+ uint_t ii;
+
+ ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+ if (tx == NULL) {
+ return (0);
+ }
+
+ /*
+ * XXX: Do we need to byteswap lr?
+ */
+
+ spa = zilog->zl_spa;
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+
+ /*
+		 * When the data is embedded in the BP there is no need to
+		 * create a BRT entry, as there is no data block. Just copy
+		 * the BP as it contains the data.
+ */
+ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+ brt_pending_add(spa, bp, tx);
+ }
+ }
+
+ return (0);
+}
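+/*
+ * Note: the above runs during pool import (claim). This is why
+ * dmu_brt_clone() is called with replay == B_TRUE during actual ZIL
+ * replay; the BRT references for these BPs were already added here.
+ */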
+
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+
+ switch (lrc->lrc_txtype) {
+ case TX_WRITE:
+ return (zil_claim_write(zilog, lrc, tx, first_txg));
+ case TX_CLONE_RANGE:
+ return (zil_claim_clone_range(zilog, lrc, tx));
+ default:
+ return (0);
+ }
+}
+
+static int
zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
uint64_t claim_txg)
{
@@ -616,24 +666,71 @@ zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
}
static int
-zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
- uint64_t claim_txg)
+zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
+ ASSERT(lrc->lrc_txtype == TX_WRITE);
+
/*
* If we previously claimed it, we need to free it.
*/
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
- bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
- !BP_IS_HOLE(bp))
+ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+ !BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+ }
return (0);
}
static int
+zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+ const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+ const blkptr_t *bp;
+ spa_t *spa;
+ uint_t ii;
+
+ ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+ if (tx == NULL) {
+ return (0);
+ }
+
+ spa = zilog->zl_spa;
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+
+ if (!BP_IS_HOLE(bp)) {
+ zio_free(spa, dmu_tx_get_txg(tx), bp);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t claim_txg)
+{
+
+ if (claim_txg == 0) {
+ return (0);
+ }
+
+ switch (lrc->lrc_txtype) {
+ case TX_WRITE:
+ return (zil_free_write(zilog, lrc, tx, claim_txg));
+ case TX_CLONE_RANGE:
+ return (zil_free_clone_range(zilog, lrc, tx));
+ default:
+ return (0);
+ }
+}
+
+static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
@@ -1798,13 +1895,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
}
/*
- * Maximum amount of write data that can be put into single log block.
+ * Maximum amount of data that can be put into a single log block.
*/
uint64_t
-zil_max_log_data(zilog_t *zilog)
+zil_max_log_data(zilog_t *zilog, size_t hdrsize)
{
- return (zilog->zl_max_block_size -
- sizeof (zil_chain_t) - sizeof (lr_write_t));
+ return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
}
/*
@@ -1814,7 +1910,7 @@ zil_max_log_data(zilog_t *zilog)
static inline uint64_t
zil_max_waste_space(zilog_t *zilog)
{
- return (zil_max_log_data(zilog) / 8);
+ return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
}
/*
@@ -1887,7 +1983,7 @@ cont:
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
- max_log_data = zil_max_log_data(zilog);
+ max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d17ee60dc..1b1a1831f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -41,6 +41,7 @@
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
@@ -1176,12 +1177,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
}
void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
+ boolean_t brtwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+ ASSERT(!brtwrite || !nopwrite);
/*
* We must reset the io_prop to match the values that existed
@@ -1190,6 +1193,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
*/
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
zio->io_prop.zp_nopwrite = nopwrite;
+ zio->io_prop.zp_brtwrite = brtwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
@@ -1222,7 +1226,8 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
BP_GET_DEDUP(bp) ||
txg != spa->spa_syncing_txg ||
(spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
- !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+ !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
+ brt_maybe_exists(spa, bp)) {
metaslab_check_free(spa, bp);
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
} else {
@@ -1249,11 +1254,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
arc_freed(spa, bp);
dsl_scan_freed(spa, bp);
- if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+ if (BP_IS_GANG(bp) ||
+ BP_GET_DEDUP(bp) ||
+ brt_maybe_exists(spa, bp)) {
/*
- * GANG and DEDUP blocks can induce a read (for the gang block
- * header, or the DDT), so issue them asynchronously so that
- * this thread is not tied up.
+ * GANG, DEDUP and BRT blocks can induce a read (for the gang
+ * block header, the DDT or the BRT), so issue them
+ * asynchronously so that this thread is not tied up.
*/
enum zio_stage stage =
ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
@@ -1594,11 +1601,15 @@ zio_write_bp_init(zio_t *zio)
zio_prop_t *zp = &zio->io_prop;
ASSERT(bp->blk_birth != zio->io_txg);
- ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (zp->zp_brtwrite)
+ return (zio);
+
+ ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
+
if (BP_IS_EMBEDDED(bp))
return (zio);
@@ -3044,6 +3055,35 @@ zio_nop_write(zio_t *zio)
/*
* ==========================================================================
+ * Block Reference Table
+ * ==========================================================================
+ */
+static zio_t *
+zio_brt_free(zio_t *zio)
+{
+ blkptr_t *bp;
+
+ bp = zio->io_bp;
+
+ if (BP_GET_LEVEL(bp) > 0 ||
+ BP_IS_METADATA(bp) ||
+ !brt_maybe_exists(zio->io_spa, bp)) {
+ return (zio);
+ }
+
+ if (!brt_entry_decref(zio->io_spa, bp)) {
+ /*
+ * This isn't the last reference, so we cannot free
+ * the data yet.
+ */
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ }
+
+ return (zio);
+}
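+/*
+ * Pipeline note: zio_brt_free() sits right before the DDT stages in
+ * zio_pipeline[] (see below), so a potentially cloned block continues
+ * down the free pipeline only once brt_entry_decref() reports that the
+ * last reference is gone.
+ */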
+
+/*
+ * ==========================================================================
* Dedup
* ==========================================================================
*/
@@ -4894,6 +4934,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_encrypt,
zio_checksum_generate,
zio_nop_write,
+ zio_brt_free,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 1511f763f..06bc75c63 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -482,6 +482,60 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
+/*
+ * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
+ * after a system failure.
+ *
+ * TODO: For now we drop block cloning transactions for ZVOLs as they are
+ * unsupported, but we still need to inform the BRT about that, as we
+ * claimed the blocks during pool import.
+ * This situation can occur when we try to import a pool from a ZFS
+ * version supporting block cloning for ZVOLs into a system running a ZFS
+ * version that doesn't support block cloning for ZVOLs.
+ */
+static int
+zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ zvol_state_t *zv = arg1;
+ objset_t *os = zv->zv_objset;
+ lr_clone_range_t *lr = arg2;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
+ spa_t *spa;
+ uint_t ii;
+ int error;
+
+ dmu_objset_name(os, name);
+ cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
+ name);
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ tx = dmu_tx_create(os);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ spa = os->os_spa;
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+
+ if (!BP_IS_HOLE(bp)) {
+ zio_free(spa, dmu_tx_get_txg(tx), bp);
+ }
+ }
+
+ (void) zil_replaying(zv->zv_zilog, tx);
+ dmu_tx_commit(tx);
+
+ return (0);
+}
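+/*
+ * Note: freeing the claimed BPs above goes through zio_free(), which in
+ * turn routes BRT-tracked blocks through zio_brt_free() (see the zio.c
+ * changes in this patch), so the references taken during ZIL claim are
+ * dropped rather than leaked.
+ */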
+
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
@@ -516,6 +570,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_SETSAXATTR */
zvol_replay_err, /* TX_RENAME_EXCHANGE */
zvol_replay_err, /* TX_RENAME_WHITEOUT */
+ zvol_replay_clone_range /* TX_CLONE_RANGE */
};
/*