diff options
51 files changed, 3480 insertions, 120 deletions
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 55df1f559..970c45c9b 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -307,6 +307,23 @@ zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg) (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); } +static void +zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) +{ + (void) zilog, (void) txtype; + const lr_clone_range_t *lr = arg; + + (void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n", + tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz); + + for (unsigned int i = 0; i < lr->lr_nbps; i++) { + (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, + (u_longlong_t)lr->lr_nbps); + print_log_bp(&lr->lr_bps[i], ""); + } +} + typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; @@ -340,6 +357,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { .zri_name = "TX_SETSAXATTR "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "}, + {.zri_print = zil_prt_rec_clone_range, + .zri_name = "TX_CLONE_RANGE "}, }; static int diff --git a/cmd/ztest.c b/cmd/ztest.c index fb9f83032..b6b99bfff 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -1902,7 +1902,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) if (zil_replaying(zd->zd_zilog, tx)) return; - if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) + if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, diff --git a/include/Makefile.am b/include/Makefile.am index 6897e3c5e..569de6dfa 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -23,6 +23,7 @@ COMMON_H = \ sys/asm_linkage.h \ sys/avl.h \ sys/avl_impl.h \ + sys/bitmap.h \ sys/bitops.h \ sys/blake3.h \ sys/blkptr.h \ @@ -31,6 +32,7 @@ COMMON_H 
= \ sys/bptree.h \ sys/bqueue.h \ sys/btree.h \ + sys/brt.h \ sys/dataset_kstats.h \ sys/dbuf.h \ sys/ddt.h \ diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h index 8cde33dbc..050fc3036 100644 --- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -119,7 +119,8 @@ typedef struct zfs_soft_state { #define zn_has_cached_data(zp, start, end) \ vn_has_cached_data(ZTOV(zp)) #define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync) -#define zn_rlimit_fsize(zp, uio) \ +#define zn_rlimit_fsize(size) zfs_rlimit_fsize(size) +#define zn_rlimit_fsize_uio(zp, uio) \ vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) /* Called on entry to each ZFS vnode and vfs operation */ @@ -179,6 +180,8 @@ extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE]; extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, char *buf); + +extern int zfs_rlimit_fsize(off_t fsize); #ifdef __cplusplus } #endif diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 67b4fc906..09d109d19 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -47,6 +47,7 @@ typedef const struct kernel_param zfs_kernel_param_t; enum scope_prefix_types { zfs, zfs_arc, + zfs_brt, zfs_condense, zfs_dbuf, zfs_dbuf_cache, diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 81607ef2a..0be2c445a 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -86,7 +86,8 @@ extern "C" { #endif #define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync) -#define zn_rlimit_fsize(zp, uio) (0) +#define zn_rlimit_fsize(size) (0) +#define zn_rlimit_fsize_uio(zp, uio) (0) /* * zhold() wraps igrab() on Linux, and igrab() may fail when the diff --git a/include/sys/bitmap.h 
b/include/sys/bitmap.h new file mode 100644 index 000000000..7b92507a7 --- /dev/null +++ b/include/sys/bitmap.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. + */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW are internal; there is no need + * for users of this package to use them.
+ */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/include/sys/brt.h b/include/sys/brt.h new file mode 100644 index 000000000..b1f701077 --- /dev/null +++ b/include/sys/brt.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#ifndef _SYS_BRT_H +#define _SYS_BRT_H + +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); + +extern uint64_t brt_get_dspace(spa_t *spa); +extern uint64_t brt_get_used(spa_t *spa); +extern uint64_t brt_get_saved(spa_t *spa); +extern uint64_t brt_get_ratio(spa_t *spa); + +extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp); +extern void brt_init(void); +extern void brt_fini(void); + +extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_apply(spa_t *spa, uint64_t txg); + +extern void brt_create(spa_t *spa); +extern int brt_load(spa_t *spa); +extern void brt_unload(spa_t *spa); +extern void brt_sync(spa_t *spa, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRT_H */ diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index a1ce76b1c..a06316362 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -172,6 +172,7 @@ typedef struct dbuf_dirty_record { override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; + boolean_t dr_brtwrite; boolean_t dr_has_raw_params; /* diff --git a/include/sys/ddt.h b/include/sys/ddt.h index d72401dcf..6378c042c 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -248,6 +248,8 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); 
+extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); + extern const ddt_ops_t ddt_zap_ops; #ifdef __cplusplus diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 93de991cc..1b82ff620 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -782,6 +782,8 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, @@ -1059,6 +1061,12 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); +int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); +void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, + boolean_t replay); + /* * Initial setup and final teardown. 
*/ diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 81e1ef6c1..ca8514e5d 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -90,6 +90,7 @@ enum dmu_tx_hold_type { THT_ZAP, THT_SPACE, THT_SPILL, + THT_CLONE, THT_NUMTYPES }; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e869685c5..25babd4ea 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -253,6 +253,9 @@ typedef enum { ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, + ZPOOL_PROP_BCLONEUSED, + ZPOOL_PROP_BCLONESAVED, + ZPOOL_PROP_BCLONERATIO, ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index cde08ec9b..8ccd58b58 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -380,6 +380,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 481209b24..a1dfef1d8 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -57,6 +57,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_TRIM (1 << 11) #define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) +#define ZFS_DEBUG_BRT (1 << 14) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index edff8f681..5da103f17 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -31,6 +31,10 @@ extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_holey(znode_t *, ulong_t, loff_t *); extern int 
zfs_access(znode_t *, int, int, cred_t *); +extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, + uint64_t *, cred_t *); +extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, + const blkptr_t *, size_t); extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index fcee55b01..012e7403e 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -315,6 +315,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz, + const blkptr_t *bps, size_t nbps); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index 9ac421043..cff8ebcad 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -166,7 +166,8 @@ typedef enum zil_create { #define TX_SETSAXATTR 21 /* Set sa xattrs on file */ #define TX_RENAME_EXCHANGE 22 /* Atomic swap via renameat2 */ #define TX_RENAME_WHITEOUT 23 /* Atomic whiteout via renameat2 */ -#define TX_MAX_TYPE 24 /* Max transaction type */ +#define TX_CLONE_RANGE 24 /* Clone a file range */ +#define TX_MAX_TYPE 25 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -176,9 +177,9 @@ typedef enum zil_create { #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* - * Transactions for write, truncate, setattr, acl_v0, and acl can be logged - * out of order. 
For convenience in the code, all such records must have - * lr_foid at the same offset. + * Transactions for operations below can be logged out of order. + * For convenience in the code, all such records must have lr_foid + * at the same offset. */ #define TX_OOO(txtype) \ ((txtype) == TX_WRITE || \ @@ -187,7 +188,8 @@ typedef enum zil_create { (txtype) == TX_ACL_V0 || \ (txtype) == TX_ACL || \ (txtype) == TX_WRITE2 || \ - (txtype) == TX_SETSAXATTR) + (txtype) == TX_SETSAXATTR || \ + (txtype) == TX_CLONE_RANGE) /* * The number of dnode slots consumed by the object is stored in the 8 @@ -387,6 +389,17 @@ typedef struct { /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to clone into */ + uint64_t lr_offset; /* offset to clone to */ + uint64_t lr_length; /* length of the blocks to clone */ + uint64_t lr_blksz; /* file's block size */ + uint64_t lr_nbps; /* number of block pointers */ + blkptr_t lr_bps[]; + /* block pointers of the blocks to clone follows */ +} lr_clone_range_t; + /* * ZIL structure definitions, interface function prototype and globals. 
*/ @@ -574,7 +587,7 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); -extern uint64_t zil_max_log_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); extern void zil_sums_init(zil_sums_t *zs); extern void zil_sums_fini(zil_sums_t *zs); diff --git a/include/sys/zio.h b/include/sys/zio.h index 28ed837d8..78603d0eb 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -348,6 +348,7 @@ typedef struct zio_prop { boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; + boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; @@ -556,7 +557,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite); + boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 199cca291..29a05986c 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -77,6 +77,12 @@ extern "C" { * and zstd. Compression occurs as part of the write pipeline and is * performed in the ZIO_STAGE_WRITE_BP_INIT stage. * + * Block cloning: + * The block cloning functionality introduces ZIO_STAGE_BRT_FREE stage which + * is called during a free pipeline. If the block is referenced in the + * Block Reference Table (BRT) we will just decrease its reference counter + * instead of actually freeing the block. + * * Dedup: * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and * ZIO_STAGE_DDT_READ_DONE stages.
These stages are added to an existing @@ -127,28 +133,30 @@ enum zio_stage { ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */ + ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */ + + ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */ - ZIO_STAGE_READY = 1 << 19, /* RWFCI */ + ZIO_STAGE_READY = 1 << 20, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */ - ZIO_STAGE_DONE = 1 << 24 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -233,6 +241,7 @@ enum zio_stage { #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_BRT_FREE | \ ZIO_STAGE_DVA_FREE) #define ZIO_DDT_FREE_PIPELINE \ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 0930bc900..ef915a709 100644 --- 
a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -78,6 +78,7 @@ typedef enum spa_feature { SPA_FEATURE_ZILSAXATTR, SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURE_BLAKE3, + SPA_FEATURE_BLOCK_CLONING, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 16fea63f8..79c020167 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -590,7 +590,7 @@ <elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> - <elf-symbol name='spa_feature_table' size='2072' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -3174,7 +3174,10 @@ <enumerator name='ZPOOL_PROP_LOAD_GUID' value='30'/> <enumerator name='ZPOOL_PROP_AUTOTRIM' value='31'/> <enumerator name='ZPOOL_PROP_COMPATIBILITY' value='32'/> - <enumerator name='ZPOOL_NUM_PROPS' value='33'/> + <enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/> + <enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/> + <enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/> + <enumerator 
name='ZPOOL_NUM_PROPS' value='36'/> </enum-decl> <typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/> <enum-decl name='vdev_prop_t' naming-typedef-id='5aa5c90c' id='1573bec8'> @@ -4850,8 +4853,8 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16576' id='d95b2b0b'> - <subrange length='37' type-id='7359adad' id='aa6426fb'/> + <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='17024' id='d95b2b0b'> + <subrange length='38' type-id='7359adad' id='aa6426fb'/> </array-type-def> <enum-decl name='spa_feature' id='33ecb627'> <underlying-type type-id='9cac1fee'/> @@ -4893,7 +4896,8 @@ <enumerator name='SPA_FEATURE_ZILSAXATTR' value='34'/> <enumerator name='SPA_FEATURE_HEAD_ERRLOG' value='35'/> <enumerator name='SPA_FEATURE_BLAKE3' value='36'/> - <enumerator name='SPA_FEATURES' value='37'/> + <enumerator name='SPA_FEATURE_BLOCK_CLONING' value='37'/> + <enumerator name='SPA_FEATURES' value='38'/> </enum-decl> <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/> <enum-decl name='zfeature_flags' id='6db816a4'> diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b3e12bd84..82965f8b9 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -339,6 +339,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_ASHIFT: case ZPOOL_PROP_MAXBLOCKSIZE: case ZPOOL_PROP_MAXDNODESIZE: + case ZPOOL_PROP_BCLONESAVED: + case ZPOOL_PROP_BCLONEUSED: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); @@ -380,6 +382,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, } break; + case ZPOOL_PROP_BCLONERATIO: case ZPOOL_PROP_DEDUPRATIO: if (literal) (void) snprintf(buf, len, "%llu.%02llu", diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 0748f1240..ceac2963e 100644 --- a/lib/libzpool/Makefile.am 
+++ b/lib/libzpool/Makefile.am @@ -74,6 +74,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/bptree.c \ module/zfs/bqueue.c \ module/zfs/btree.c \ + module/zfs/brt.c \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 3ff3d97ba..a4d595cd3 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -347,6 +347,20 @@ BLAKE3 is a secure hash algorithm focused on high performance. .Pp .checksum-spiel blake3 . +.feature com.fudosecurity block_cloning yes +When this feature is enabled ZFS will use block cloning for operations like +.Fn copy_file_range 2 . +Block cloning allows creating multiple references to a single block. +It is much faster than copying the data (as the actual data is neither read nor +written) and takes no additional space. +Blocks can be cloned across datasets under some conditions (like disabled +encryption and equal +.Nm recordsize ) . +.Pp +This feature becomes +.Sy active +when the first block is cloned. +When the last cloned block is freed, it goes back to the enabled state. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 7be0a21d9..12b9b1190 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -42,13 +42,26 @@ change the behavior of the pool. .Pp The following are read-only properties: .Bl -tag -width "unsupported@guid" -.It Cm allocated +.It Sy allocated Amount of storage used within the pool. See .Sy fragmentation and .Sy free for more information. +.It Sy bcloneratio +The ratio of the total amount of storage that would be required to store all +the cloned blocks without cloning to the actual storage used.
+The +.Sy bcloneratio +property is calculated as: +.Pp +.Sy ( ( bclonesaved + bcloneused ) * 100 ) / bcloneused +.It Sy bclonesaved +The amount of additional storage that would be required if block cloning +was not used. +.It Sy bcloneused +The amount of storage used by cloned blocks. .It Sy capacity Percentage of pool space used. This property can also be referred to by its shortened column name, @@ -103,16 +116,16 @@ Over time will decrease while .Sy free increases. -.It Sy leaked -Space not released while -.Sy freeing -due to corruption, now permanently leaked into the pool. +.It Sy guid +A unique identifier for the pool. .It Sy health The current health of the pool. Health can be one of .Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . -.It Sy guid -A unique identifier for the pool. +.It Sy leaked +Space not released while +.Sy freeing +due to corruption, now permanently leaked into the pool. .It Sy load_guid A unique identifier for the pool. Unlike the diff --git a/module/Kbuild.in b/module/Kbuild.in index 21606b8ca..8d29f56c2 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -305,6 +305,7 @@ ZFS_OBJS := \ bpobj.o \ bptree.o \ bqueue.o \ + brt.o \ btree.o \ dataset_kstats.o \ dbuf.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 667678796..8ec094d4a 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -33,11 +33,11 @@ KMOD= openzfs ${SRCDIR}/zstd/lib/decompress CFLAGS+= -I${INCDIR} +CFLAGS+= -I${SRCDIR}/icp/include CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs CFLAGS+= -I${SRCDIR}/zstd/include -CFLAGS+= -I${SRCDIR}/icp/include CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -I${.CURDIR} @@ -243,6 +243,7 @@ SRCS+= abd.c \ blkptr.c \ bplist.c \ bpobj.c \ + brt.c \ btree.c \ cityhash.c \ dbuf.c \ diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 35edea0a2..eccb91def 100644 --- 
a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -97,6 +97,8 @@ __FBSDID("$FreeBSD$"); SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, + "ZFS Block Reference Table"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 9fb287313..30851f527 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -153,7 +153,12 @@ struct vfsops zfs_vfsops = { .vfs_quotactl = zfs_quotactl, }; -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); +#ifdef VFCF_CROSS_COPY_FILE_RANGE +VFS_SET(zfs_vfsops, zfs, + VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE); +#else +VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL); +#endif /* * We need to keep a count of active fs's. 
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 148def20c..9169244b1 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -30,7 +30,6 @@ /* Portions Copyright 2010 Robert Milkowski */ -#include <sys/types.h> #include <sys/param.h> #include <sys/time.h> #include <sys/systm.h> @@ -84,6 +83,12 @@ #include <vm/vm_param.h> #include <sys/zil.h> #include <sys/zfs_vnops.h> +#include <sys/module.h> +#include <sys/sysent.h> +#include <security/mac/mac_framework.h> +#include <sys/dmu_impl.h> +#include <sys/brt.h> +#include <sys/zfeature.h> #include <vm/vm_object.h> @@ -6209,6 +6214,93 @@ zfs_deallocate(struct vop_deallocate_args *ap) } #endif +#ifndef _SYS_SYSPROTO_H_ +struct vop_copy_file_range_args { + struct vnode *a_invp; + off_t *a_inoffp; + struct vnode *a_outvp; + off_t *a_outoffp; + size_t *a_lenp; + unsigned int a_flags; + struct ucred *a_incred; + struct ucred *a_outcred; + struct thread *a_fsizetd; +} +#endif +/* + * TODO: FreeBSD will only call file system-specific copy_file_range() if both + * files reside under the same mountpoint. In case of ZFS we want to be called + * even if files are in different datasets (but on the same pool, which we need + * to check ourselves). + */ +static int +zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) +{ + struct vnode *invp = ap->a_invp; + struct vnode *outvp = ap->a_outvp; + struct mount *mp; + struct uio io; + int error; + + /* + * TODO: If offset/length is not aligned to recordsize, use + * vn_generic_copy_file_range() on this fragment. + * It would be better to do this after we lock the vnodes, but then we + * need something other than vn_generic_copy_file_range(). + */ + + /* Lock both vnodes, avoiding risk of deadlock.
*/ + do { + mp = NULL; + error = vn_start_write(outvp, &mp, V_WAIT); + if (error == 0) { + error = vn_lock(outvp, LK_EXCLUSIVE); + if (error == 0) { + if (invp == outvp) + break; + error = vn_lock(invp, LK_SHARED | LK_NOWAIT); + if (error == 0) + break; + VOP_UNLOCK(outvp); + if (mp != NULL) + vn_finished_write(mp); + mp = NULL; + error = vn_lock(invp, LK_SHARED); + if (error == 0) + VOP_UNLOCK(invp); + } + } + if (mp != NULL) + vn_finished_write(mp); + } while (error == 0); + if (error != 0) + return (error); +#ifdef MAC + error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred, + outvp); + if (error != 0) + goto unlock; +#endif + + io.uio_offset = *ap->a_outoffp; + io.uio_resid = *ap->a_lenp; + error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd); + if (error != 0) + goto unlock; + + error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), + ap->a_outoffp, ap->a_lenp, ap->a_fsizetd->td_ucred); + +unlock: + if (invp != outvp) + VOP_UNLOCK(invp); + VOP_UNLOCK(outvp); + if (mp != NULL) + vn_finished_write(mp); + + return (error); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; @@ -6272,6 +6364,7 @@ struct vop_vector zfs_vnodeops = { #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif + .vop_copy_file_range = zfs_freebsd_copy_file_range, }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 304bc71f9..dc1d31e1b 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -34,6 +34,7 @@ #include <sys/systm.h> #include <sys/sysmacros.h> #include <sys/resource.h> +#include <sys/resourcevar.h> #include <sys/mntent.h> #include <sys/u8_textprep.h> #include <sys/dsl_dataset.h> @@ -2113,3 +2114,28 @@ zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) return (err); } #endif /* _KERNEL */ + +#ifdef _KERNEL +int +zfs_rlimit_fsize(off_t fsize) +{ + struct 
thread *td = curthread; + off_t lim; + + if (td == NULL) + return (0); + + lim = lim_cur(td, RLIMIT_FSIZE); + if (__predict_true((uoff_t)fsize <= lim)) + return (0); + + /* + * The limit is reached. + */ + PROC_LOCK(td->td_proc); + kern_psignal(td->td_proc, SIGXFSZ); + PROC_UNLOCK(td->td_proc); + + return (EFBIG); +} +#endif /* _KERNEL */ diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index ea45c9f8a..6fe1da8ed 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -725,6 +725,12 @@ zpool_feature_init(void) blake3_deps, sfeatures); } + zfeature_register(SPA_FEATURE_BLOCK_CLONING, + "com.fudosecurity:block_cloning", "block_cloning", + "Support for block cloning via Block Reference Table.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e99acef5a..459ff62fc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -116,6 +116,15 @@ zpool_prop_init(void) zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_BCLONEUSED, "bcloneused", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", + "BCLONE_USED", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_BCLONESAVED, "bclonesaved", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", + "BCLONE_SAVED", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>", + "BCLONE_RATIO", B_FALSE, sfeatures); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, diff --git a/module/zfs/brt.c b/module/zfs/brt.c new file mode 100644 index 000000000..ca9c4e678 --- /dev/null +++ b/module/zfs/brt.c @@ -0,0 +1,1884 @@ +/* + * CDDL HEADER START + * + * 
The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/brt.h> +#include <sys/ddt.h> +#include <sys/bitmap.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> +#include <sys/vdev_impl.h> +#include <sys/kstat.h> +#include <sys/wmsum.h> + +/* + * Block Cloning design. + * + * Block Cloning allows to manually clone a file (or a subset of its blocks) + * into another (or the same) file by just creating additional references to + * the data blocks without copying the data itself. Those references are kept + * in the Block Reference Tables (BRTs). + * + * In many ways this is similar to the existing deduplication, but there are + * some important differences: + * + * - Deduplication is automatic and Block Cloning is not - one has to use a + * dedicated system call(s) to clone the given file/blocks. + * - Deduplication keeps all data blocks in its table, even those referenced + * just once. 
Block Cloning creates an entry in its tables only when there + * are at least two references to the given data block. If the block was + * never explicitly cloned or the second to last reference was dropped, + * there will be neither space nor performance overhead. + * - Deduplication needs data to work - one needs to pass real data to the + * write(2) syscall, so a hash can be calculated. Block Cloning doesn't require + * data, just block pointers to the data, so it is extremely fast, as we pay + * neither the cost of reading the data, nor the cost of writing the data - + * we operate exclusively on metadata. + * - If the D (dedup) bit is not set in the block pointer, it means that + * the block is not in the dedup table (DDT) and we won't consult the DDT + * when we need to free the block. Block Cloning must be consulted on every + * free, because we cannot modify the source BP (eg. by setting something + * similar to the D bit), thus we have no hint if the block is in the + * Block Reference Table (BRT), so we need to look into the BRT. There is + * an optimization in place that allows us to eliminate the majority of BRT + * lookups which is described below in the "Minimizing free penalty" section. + * - The BRT entry is much smaller than the DDT entry - for BRT we only store + * 64bit offset and 64bit reference counter. + * - Dedup keys are cryptographic hashes, so two blocks that are close to each + * other on disk are most likely in totally different parts of the DDT. + * The BRT entry keys are offsets into a single top-level VDEV, so data blocks + * from one file should have BRT entries close to each other. + * - Scrub will only do a single pass over a block that is referenced multiple + * times in the DDT. Unfortunately it is not currently (if at all) possible + * with Block Cloning and a block referenced multiple times will be scrubbed + * multiple times. The new, sorted scrub should be able to eliminate + * duplicated reads given enough memory.
+ * - Deduplication requires cryptographically strong hash as a checksum or + * additional data verification. Block Cloning works with any checksum + * algorithm or even with checksumming disabled. + * + * As mentioned above, the BRT entries are much smaller than the DDT entries. + * To uniquely identify a block we just need its vdev id and offset. We also + * need to maintain a reference counter. The vdev id will often repeat, as there + * is a small number of top-level VDEVs and a large number of blocks stored in + * each VDEV. We take advantage of that to reduce the BRT entry size further by + * maintaining one BRT for each top-level VDEV, so we can then have only offset + * and counter as the BRT entry. + * + * Minimizing free penalty. + * + * Block Cloning allows creating additional references to any existing block. + * When we free a block there is no hint in the block pointer whether the block + * was cloned or not, so on each free we have to check if there is a + * corresponding entry in the BRT or not. If there is, we need to decrease + * the reference counter. Doing BRT lookup on every free can potentially be + * expensive by requiring additional I/Os if the BRT doesn't fit into memory. + * This is the main problem with deduplication, so we've learned our lesson and + * try not to repeat the same mistake here. How do we do that? We divide each + * top-level VDEV into 16MB regions. For each region we maintain a counter that + * is a sum of all the BRT entries that have offsets within the region. This + * creates the entries count array of 16bit numbers for each top-level VDEV. + * The entries count array is always kept in memory and updated on disk in the + * same transaction group as the BRT updates to keep everything in-sync. We can + * keep the array in memory, because it is very small. With 16MB regions and + * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease + * the region size even further in the future). 
Now, when we want to free + * a block, we first consult the array. If the counter for the whole region is + * zero, there is no need to look for the BRT entry, as there isn't one for + * sure. If the counter for the region is greater than zero, only then we will + * do a BRT lookup and if an entry is found we will decrease the reference + * counter in the BRT entry and in the entry counters array. + * + * The entry counters array is small, but can potentially be larger for very + * large VDEVs or smaller regions. In this case we don't want to rewrite the + * entire array on every change. We then divide the array into 32kB blocks and + * keep a bitmap of dirty blocks within a transaction group. When we sync the + * transaction group we can update only the parts of the entry counters array + * that were modified. Note: Keeping track of the dirty parts of the entry + * counters array is implemented, but updating only parts of the array on disk + * is not yet implemented - for now we will update the entire array if there + * was any change. + * + * The implementation tries to be economic: if BRT is not used, or no longer + * used, there will be no entries in the MOS and no additional memory used (eg. + * the entry counters array is only allocated if needed). + * + * Interaction between Deduplication and Block Cloning. + * + * If both functionalities are in use, we could end up with a block that is + * referenced multiple times in both DDT and BRT. When we free one of the + * references we couldn't tell where it belongs, so we would have to decide + * what table takes the precedence: do we first clear DDT references or BRT + * references? To avoid this dilemma BRT cooperates with DDT - if a given block + * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will + * lookup DDT entry instead and increase the counter there. No BRT entry + * will be created for a block which has the D (dedup) bit set.
+ * BRT may be more efficient for manual deduplication, but if the block is + * already in the DDT, then creating an additional BRT entry would be less + * efficient. This clever idea was proposed by Allan Jude. + * + * Block Cloning across datasets. + * + * Block Cloning is not limited to cloning blocks within the same dataset. + * It is possible (and very useful) to clone blocks between different datasets. + * One use case is recovering files from snapshots. By cloning the files into + * the dataset we need no additional storage. Without Block Cloning we would + * need additional space for those files. + * Another interesting use case is moving the files between datasets + * (copying the file content to the new dataset and removing the source file). + * In that case Block Cloning will only be used briefly, because the BRT entries + * will be removed when the source is removed. + * Note: currently it is not possible to clone blocks between encrypted + * datasets, even if those datasets use the same encryption key (this includes + * snapshots of encrypted datasets). Cloning blocks between datasets that use + * the same keys should be possible and should be implemented in the future. + * + * Block Cloning flow through ZFS layers. + * + * Note: Block Cloning can be used both for cloning file system blocks and ZVOL + * blocks. As of this writing no interface is implemented that allows for block + * cloning within a ZVOL. + * FreeBSD and Linux provide the copy_file_range(2) system call and we will use + * it for block cloning. + * + * ssize_t + * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, + * size_t len, unsigned int flags); + * + * Even though offsets and length represent bytes, they have to be + * block-aligned or we will return the EXDEV error so the upper layer can + * fall back to the generic mechanism that will just copy the data. + * Using copy_file_range(2) will call the OS-independent zfs_clone_range() + * function.
+ * This function was implemented based on zfs_write(), but instead of writing + * the given data we first read block pointers using the new dmu_read_l0_bps() + * function from the source file. Once we have BPs from the source file we call + * the dmu_brt_clone() function on the destination file. This function + * allocates BPs for us. We iterate over all source BPs. If the given BP is + * a hole or an embedded block, we just copy BP as-is. If it points to a real + * data we place this BP on a BRT pending list using the brt_pending_add() + * function. + * + * We use this pending list to keep track of all BPs that got new references + * within this transaction group. + * + * Some special cases to consider and how we address them: + * - The block we want to clone may have been created within the same + * transaction group that we are trying to clone. Such block has no BP + * allocated yet, so cannot be immediately cloned. We return EXDEV. + * - The block we want to clone may have been modified within the same + * transaction group. We return EXDEV. + * - A block may be cloned multiple times during one transaction group (that's + * why pending list is actually a tree and not an append-only list - this + * way we can figure out faster if this block is cloned for the first time + * in this txg or consecutive time). + * - A block may be cloned and freed within the same transaction group + * (see dbuf_undirty()). + * - A block may be cloned and within the same transaction group the clone + * can be cloned again (see dmu_read_l0_bps()). + * - A file might have been deleted, but the caller still has a file descriptor + * open to this file and clones it. + * + * When we free a block we have an additional step in the ZIO pipeline where we + * call the zio_brt_free() function. We then call the brt_entry_decref() + * that loads the corresponding BRT entry (if one exists) and decreases + * reference counter. If this is not the last reference we will stop ZIO + * pipeline here. 
If this is the last reference or the block is not in the + * BRT, we continue the pipeline and free the block as usual. + * + * At the beginning of spa_sync() where there can be no more block cloning, + * but before issuing frees we call brt_pending_apply(). This function applies + * all the new clones to the BRT table - we load BRT entries and update + * reference counters. To sync new BRT entries to disk, we use brt_sync() + * function. This function will sync all dirty per-top-level-vdev BRTs, + * the entry counters arrays, etc. + * + * Block Cloning and ZIL. + * + * Every clone operation is divided into chunks (similar to write) and each + * chunk is cloned in a separate transaction. The chunk size is determined by + * how many BPs we can fit into a single ZIL entry. + * Replaying clone operation is different from the regular clone operation, + * as when we log clone operations we cannot use the source object - it may + * reside on a different dataset, so we log BPs we want to clone. + * The ZIL is replayed when we mount the given dataset, not when the pool is + * imported. Taking this into account it is possible that the pool is imported + * without mounting datasets and the source dataset is destroyed before the + * destination dataset is mounted and its ZIL replayed. + * To address this situation we leverage zil_claim() mechanism where ZFS will + * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE + * entries, we will bump reference counters for their BPs in the BRT and then + * on mount and ZIL replay we will just attach BPs to the file without + * bumping reference counters. + * Note it is still possible that after zil_claim() we never mount the + * destination, so we never replay its ZIL and we destroy it. This way we would + * end up with leaked references in BRT. We address that too as ZFS gives us + * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + */ + +/* + * BRT - Block Reference Table. 
+ */ +#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" + +/* + * We divide each VDEV into 16MB chunks. Each chunk is represented in memory + * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B + * Each element in this array represents how many BRT entries we have in this + * chunk of storage. We always load this entire array into memory and update as + * needed. By having it in memory we can quickly tell (during zio_free()) if + * there are any BRT entries that we might need to update. + * + * This value cannot be larger than 16MB, at least as long as we support + * 512 byte block sizes. With 512 byte block size we can have exactly + * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too + * many for a 16bit counter. + */ +#define BRT_RANGESIZE (16 * 1024 * 1024) +_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, + "BRT_RANGESIZE is too large."); +/* + * We don't want to update the whole structure every time. Maintain bitmap + * of dirty blocks within the regions, so that a single bit represents a + * block size of entcounts. For example if we have a 1PB vdev then all + * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this + * 128MB array of entcounts into 32kB disk blocks, as we don't want to update + * the whole 128MB on disk when we have updated only a single entcount. + * We maintain a bitmap where each 32kB disk block within 128MB entcounts array + * is represented by a single bit. This gives us 4096 bits. A set bit in the + * bitmap means that we had a change in at least one of the 16384 entcounts + * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */ +#define BRT_BLOCKSIZE (32 * 1024) +#define BRT_RANGESIZE_TO_NBLOCKS(size) \ + (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) + +#define BRT_LITTLE_ENDIAN 0 +#define BRT_BIG_ENDIAN 1 +#ifdef _ZFS_LITTLE_ENDIAN +#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#else +#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#endif + +typedef struct brt_vdev_phys { + uint64_t bvp_mos_entries; + uint64_t bvp_size; + uint64_t bvp_byteorder; + uint64_t bvp_totalcount; + uint64_t bvp_rangesize; + uint64_t bvp_usedspace; + uint64_t bvp_savedspace; +} brt_vdev_phys_t; + +typedef struct brt_vdev { + /* + * VDEV id. + */ + uint64_t bv_vdevid; + /* + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) + */ + boolean_t bv_initiated; + /* + * Object number in the MOS for the entcount array and brt_vdev_phys. + */ + uint64_t bv_mos_brtvdev; + /* + * Object number in the MOS for the entries table. + */ + uint64_t bv_mos_entries; + /* + * Entries to sync. + */ + avl_tree_t bv_tree; + /* + * Does the bv_entcount[] array needs byte swapping? + */ + boolean_t bv_need_byteswap; + /* + * Number of entries in the bv_entcount[] array. + */ + uint64_t bv_size; + /* + * This is the array with BRT entry count per BRT_RANGESIZE. + */ + uint16_t *bv_entcount; + /* + * Sum of all bv_entcount[]s. + */ + uint64_t bv_totalcount; + /* + * Space on disk occupied by cloned blocks (without compression). + */ + uint64_t bv_usedspace; + /* + * How much additional space would be occupied without block cloning. + */ + uint64_t bv_savedspace; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * bv_entcount[] potentially can be a bit too big to sychronize it all + * when we just changed few entcounts. 
The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + */ + ulong_t *bv_bitmap; + uint64_t bv_nblocks; +} brt_vdev_t; + +/* + * In-core brt + */ +typedef struct brt { + krwlock_t brt_lock; + spa_t *brt_spa; +#define brt_mos brt_spa->spa_meta_objset + uint64_t brt_rangesize; + uint64_t brt_usedspace; + uint64_t brt_savedspace; + avl_tree_t brt_pending_tree[TXG_SIZE]; + kmutex_t brt_pending_lock[TXG_SIZE]; + /* Sum of all entries across all bv_trees. */ + uint64_t brt_nentries; + brt_vdev_t *brt_vdevs; + uint64_t brt_nvdevs; +} brt_t; + +/* Size of bre_offset / sizeof (uint64_t). */ +#define BRT_KEY_WORDS (1) + +/* + * In-core brt entry. + * On-disk we use bre_offset as the key and bre_refcount as the value. + */ +typedef struct brt_entry { + uint64_t bre_offset; + uint64_t bre_refcount; + avl_node_t bre_node; +} brt_entry_t; + +typedef struct brt_pending_entry { + blkptr_t bpe_bp; + int bpe_count; + avl_node_t bpe_node; +} brt_pending_entry_t; + +static kmem_cache_t *brt_entry_cache; +static kmem_cache_t *brt_pending_entry_cache; + +/* + * Enable/disable prefetching of BRT entries that we are going to modify. + */ +int zfs_brt_prefetch = 1; + +#ifdef ZFS_DEBUG +#define BRT_DEBUG(...) do { \ + if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ + __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ + } \ +} while (0) +#else +#define BRT_DEBUG(...) 
do { } while (0) +#endif + +int brt_zap_leaf_blockshift = 12; +int brt_zap_indirect_blockshift = 12; + +static kstat_t *brt_ksp; + +typedef struct brt_stats { + kstat_named_t brt_addref_entry_in_memory; + kstat_named_t brt_addref_entry_not_on_disk; + kstat_named_t brt_addref_entry_on_disk; + kstat_named_t brt_addref_entry_read_lost_race; + kstat_named_t brt_decref_entry_in_memory; + kstat_named_t brt_decref_entry_loaded_from_disk; + kstat_named_t brt_decref_entry_not_in_memory; + kstat_named_t brt_decref_entry_not_on_disk; + kstat_named_t brt_decref_entry_read_lost_race; + kstat_named_t brt_decref_entry_still_referenced; + kstat_named_t brt_decref_free_data_later; + kstat_named_t brt_decref_free_data_now; + kstat_named_t brt_decref_no_entry; +} brt_stats_t; + +static brt_stats_t brt_stats = { + { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, + { "decref_free_data_later", KSTAT_DATA_UINT64 }, + { "decref_free_data_now", KSTAT_DATA_UINT64 }, + { "decref_no_entry", KSTAT_DATA_UINT64 } +}; + +struct { + wmsum_t brt_addref_entry_in_memory; + wmsum_t brt_addref_entry_not_on_disk; + wmsum_t brt_addref_entry_on_disk; + wmsum_t brt_addref_entry_read_lost_race; + wmsum_t brt_decref_entry_in_memory; + wmsum_t brt_decref_entry_loaded_from_disk; + wmsum_t brt_decref_entry_not_in_memory; + wmsum_t brt_decref_entry_not_on_disk; + wmsum_t brt_decref_entry_read_lost_race; + wmsum_t brt_decref_entry_still_referenced; + wmsum_t brt_decref_free_data_later; + wmsum_t brt_decref_free_data_now; + wmsum_t 
brt_decref_no_entry; +} brt_sums; + +#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) + +static int brt_entry_compare(const void *x1, const void *x2); +static int brt_pending_entry_compare(const void *x1, const void *x2); + +static void +brt_rlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_READER); +} + +static void +brt_wlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_WRITER); +} + +static void +brt_unlock(brt_t *brt) +{ + rw_exit(&brt->brt_lock); +} + +static uint16_t +brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (brtvd->bv_need_byteswap) { + return (BSWAP_16(brtvd->bv_entcount[idx])); + } else { + return (brtvd->bv_entcount[idx]); + } +} + +static void +brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (brtvd->bv_need_byteswap) { + brtvd->bv_entcount[idx] = BSWAP_16(entcnt); + } else { + brtvd->bv_entcount[idx] = entcnt; + } +} + +static void +brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt < UINT16_MAX); + + brt_vdev_entcount_set(brtvd, idx, entcnt + 1); +} + +static void +brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt > 0); + + brt_vdev_entcount_set(brtvd, idx, entcnt - 1); +} + +#ifdef ZFS_DEBUG +static void +brt_vdev_dump(brt_t *brt) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { + return; + } + + if (brt->brt_nvdevs == 0) { + zfs_dbgmsg("BRT empty"); + return; + } + + zfs_dbgmsg("BRT vdev dump:"); + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + uint64_t idx; + + brtvd = &brt->brt_vdevs[vdevid]; + zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu 
bitmapsize=%zu\n", + (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + (u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + if (brt_vdev_entcount_get(brtvd, idx) > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, + brt_vdev_entcount_get(brtvd, idx)); + } + } + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; + + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; + } + bitmap[idx] = '\0'; + zfs_dbgmsg(" bitmap: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); + } + } +} +#endif + +static brt_vdev_t * +brt_vdev(brt_t *brt, uint64_t vdevid) +{ + brt_vdev_t *brtvd; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (vdevid < brt->brt_nvdevs) { + brtvd = &brt->brt_vdevs[vdevid]; + } else { + brtvd = NULL; + } + + return (brtvd); +} + +static void +brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT0(brtvd->bv_mos_brtvdev); + ASSERT0(brtvd->bv_mos_entries); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + + brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, + brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, + 0, tx); + VERIFY(brtvd->bv_mos_entries != 0); + BRT_DEBUG("MOS entries created, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + + /* + * We allocate DMU buffer to store the bv_entcount[] array. + * We will keep array size (bv_size) and cummulative count for all + * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
+ */ + brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); + VERIFY(brtvd->bv_mos_brtvdev != 0); + BRT_DEBUG("MOS BRT VDEV created, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("Pool directory object created, object=%s", name); + + spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +{ + vdev_t *vd; + uint16_t *entcount; + ulong_t *bitmap; + uint64_t nblocks, size; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + + spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; + spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + + entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); + nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); + bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); + + if (!brtvd->bv_initiated) { + ASSERT0(brtvd->bv_size); + ASSERT(brtvd->bv_entcount == NULL); + ASSERT(brtvd->bv_bitmap == NULL); + ASSERT0(brtvd->bv_nblocks); + + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + } else { + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + /* + * TODO: Allow vdev shrinking. We only need to implement + * shrinking the on-disk BRT VDEV object. 
+ * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, + * size, tx); + */ + ASSERT3U(brtvd->bv_size, <=, size); + + memcpy(entcount, brtvd->bv_entcount, + sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(brtvd->bv_nblocks))); + kmem_free(brtvd->bv_entcount, + sizeof (entcount[0]) * brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + } + + brtvd->bv_size = size; + brtvd->bv_entcount = entcount; + brtvd->bv_bitmap = bitmap; + brtvd->bv_nblocks = nblocks; + if (!brtvd->bv_initiated) { + brtvd->bv_need_byteswap = FALSE; + brtvd->bv_initiated = TRUE; + BRT_DEBUG("BRT VDEV %llu initiated.", + (u_longlong_t)brtvd->bv_vdevid); + } +} + +static void +brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +{ + char name[64]; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + int error; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); + if (error != 0) + return; + ASSERT(brtvd->bv_mos_brtvdev != 0); + + error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); + ASSERT0(error); + if (error != 0) + return; + + bvphys = db->db_data; + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = bvphys->bvp_rangesize; + } else { + ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + } + + ASSERT(!brtvd->bv_initiated); + brt_vdev_realloc(brt, brtvd); + + /* TODO: We don't support VDEV shrinking. */ + ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); + + /* + * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
+ */ + error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), + brtvd->bv_entcount, DMU_READ_NO_PREFETCH); + ASSERT0(error); + + brtvd->bv_mos_entries = bvphys->bvp_mos_entries; + ASSERT(brtvd->bv_mos_entries != 0); + brtvd->bv_need_byteswap = + (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); + brtvd->bv_totalcount = bvphys->bvp_totalcount; + brtvd->bv_usedspace = bvphys->bvp_usedspace; + brtvd->bv_savedspace = bvphys->bvp_savedspace; + brt->brt_usedspace += brtvd->bv_usedspace; + brt->brt_savedspace += brtvd->bv_savedspace; + + dmu_buf_rele(db, FTAG); + + BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", + name, (u_longlong_t)brtvd->bv_mos_brtvdev, + (u_longlong_t)brtvd->bv_mos_entries); +} + +static void +brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +{ + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); + + kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); + brtvd->bv_entcount = NULL; + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_bitmap = NULL; + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + avl_destroy(&brtvd->bv_tree); + + brtvd->bv_size = 0; + brtvd->bv_nblocks = 0; + + brtvd->bv_initiated = FALSE; + BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); +} + +static void +brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + uint64_t count; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(brtvd->bv_mos_entries != 0); + + VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); + VERIFY0(count); + VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + brtvd->bv_mos_entries = 0; + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + bvphys = 
db->db_data; + ASSERT0(bvphys->bvp_totalcount); + ASSERT0(bvphys->bvp_usedspace); + ASSERT0(bvphys->bvp_savedspace); + dmu_buf_rele(db, FTAG); + + VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + brtvd->bv_mos_brtvdev = 0; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + BRT_DEBUG("Pool directory object removed, object=%s", name); + + brt_vdev_dealloc(brt, brtvd); + + spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +{ + brt_vdev_t *brtvd, *vdevs; + uint64_t vdevid; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT3U(nvdevs, >, brt->brt_nvdevs); + + vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); + if (brt->brt_nvdevs > 0) { + ASSERT(brt->brt_vdevs != NULL); + + memcpy(vdevs, brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + kmem_free(brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + } + for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { + brtvd = &vdevs[vdevid]; + + brtvd->bv_vdevid = vdevid; + brtvd->bv_initiated = FALSE; + } + + BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", + (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); + + brt->brt_vdevs = vdevs; + brt->brt_nvdevs = nvdevs; +} + +static boolean_t +brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + idx = bre->bre_offset / brt->brt_rangesize; + if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { + /* VDEV wasn't expanded. 
*/ + return (brt_vdev_entcount_get(brtvd, idx) > 0); + } + + return (FALSE); +} + +static void +brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace += dsize; + brtvd->bv_savedspace += dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 1) { + return; + } + + brt->brt_usedspace += dsize; + brtvd->bv_usedspace += dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + if (idx >= brtvd->bv_size) { + /* VDEV has been expanded. */ + brt_vdev_realloc(brt, brtvd); + } + + ASSERT3U(idx, <, brtvd->bv_size); + + brtvd->bv_totalcount++; + brt_vdev_entcount_inc(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + brt_vdev_dump(brt); +#endif +} + +static void +brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace -= dsize; + brtvd->bv_savedspace -= dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 0) { + return; + } + + brt->brt_usedspace -= dsize; + brtvd->bv_usedspace -= dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + ASSERT3U(idx, <, brtvd->bv_size); + + ASSERT(brtvd->bv_totalcount > 0); + brtvd->bv_totalcount--; + brt_vdev_entcount_dec(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + brt_vdev_dump(brt); +#endif +} + +static void +brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(brtvd->bv_meta_dirty); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, 
&db)); + + if (brtvd->bv_entcount_dirty) { + /* + * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. + */ + dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), + brtvd->bv_entcount, tx); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_entcount_dirty = FALSE; + } + + dmu_buf_will_dirty(db, tx); + bvphys = db->db_data; + bvphys->bvp_mos_entries = brtvd->bv_mos_entries; + bvphys->bvp_size = brtvd->bv_size; + if (brtvd->bv_need_byteswap) { + bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; + } else { + bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; + } + bvphys->bvp_totalcount = brtvd->bv_totalcount; + bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_usedspace = brtvd->bv_usedspace; + bvphys->bvp_savedspace = brtvd->bv_savedspace; + dmu_buf_rele(db, FTAG); + + brtvd->bv_meta_dirty = FALSE; +} + +static void +brt_vdevs_alloc(brt_t *brt, boolean_t load) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); + + if (load) { + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + ASSERT(brtvd->bv_entcount == NULL); + + brt_vdev_load(brt, brtvd); + } + } + + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = BRT_RANGESIZE; + } + + brt_unlock(brt); +} + +static void +brt_vdevs_free(brt_t *brt) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd->bv_initiated) + brt_vdev_dealloc(brt, brtvd); + } + kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); + + brt_unlock(brt); +} + +static void +brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) +{ + + bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); + bre->bre_refcount = 0; + + *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); +} + +static int +brt_entry_compare(const void 
*x1, const void *x2) +{ + const brt_entry_t *bre1 = x1; + const brt_entry_t *bre2 = x2; + + return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); +} + +static int +brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) +{ + uint64_t mos_entries; + uint64_t one, physsize; + int error; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (!brt_vdev_lookup(brt, brtvd, bre)) + return (SET_ERROR(ENOENT)); + + /* + * Remember mos_entries object number. After we reacquire the BRT lock, + * the brtvd pointer may be invalid. + */ + mos_entries = brtvd->bv_mos_entries; + if (mos_entries == 0) + return (SET_ERROR(ENOENT)); + + brt_unlock(brt); + + error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, &one, &physsize); + if (error == 0) { + ASSERT3U(one, ==, 1); + ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); + + error = zap_lookup_uint64(brt->brt_mos, mos_entries, + &bre->bre_offset, BRT_KEY_WORDS, 1, + sizeof (bre->bre_refcount), &bre->bre_refcount); + BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " + "count=%llu error=%d", (u_longlong_t)mos_entries, + (u_longlong_t)brtvd->bv_vdevid, + (u_longlong_t)bre->bre_offset, + error == 0 ? 
(u_longlong_t)bre->bre_refcount : 0, error); + } + + brt_wlock(brt); + + return (error); +} + +static void +brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) +{ + brt_vdev_t *brtvd; + uint64_t mos_entries = 0; + + brt_rlock(brt); + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL) + mos_entries = brtvd->bv_mos_entries; + brt_unlock(brt); + + if (mos_entries == 0) + return; + + BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", + (u_longlong_t)mos_entries, (u_longlong_t)vdevid, + (u_longlong_t)bre->bre_offset); + (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, + (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); +} + +static int +brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +{ + int error; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_entries != 0); + ASSERT(bre->bre_refcount > 0); + + error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, + (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, + sizeof (bre->bre_refcount), &bre->bre_refcount, tx); + BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " + "error=%d", (u_longlong_t)brtvd->bv_mos_entries, + (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, + (u_longlong_t)bre->bre_refcount, error); + + return (error); +} + +static int +brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +{ + int error; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_entries != 0); + ASSERT0(bre->bre_refcount); + + error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, + (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); + BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " + "error=%d", (u_longlong_t)brtvd->bv_mos_entries, + (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, + (u_longlong_t)bre->bre_refcount, error); + + return (error); +} + +/* + * Return TRUE if we _can_ have BRT entry for this bp. 
It might be false + * positive, but gives us quick answer if we should look into BRT, which + * may require reads and thus will be more expensive. + */ +boolean_t +brt_maybe_exists(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search; + boolean_t mayexists = FALSE; + uint64_t vdevid; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL && brtvd->bv_initiated) { + if (!avl_is_empty(&brtvd->bv_tree) || + brt_vdev_lookup(brt, brtvd, &bre_search)) { + mayexists = TRUE; + } + } + + brt_unlock(brt); + + return (mayexists); +} + +uint64_t +brt_get_dspace(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_used(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_usedspace); +} + +uint64_t +brt_get_saved(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_ratio(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt->brt_usedspace == 0) + return (100); + + return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / + brt->brt_usedspace); +} + +static int +brt_kstats_update(kstat_t *ksp, int rw) +{ + brt_stats_t *bs = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + bs->brt_addref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_in_memory); + bs->brt_addref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); + bs->brt_addref_entry_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_on_disk); + bs->brt_addref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); + bs->brt_decref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_in_memory); + bs->brt_decref_entry_loaded_from_disk.value.ui64 = + 
wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); + bs->brt_decref_entry_not_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); + bs->brt_decref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); + bs->brt_decref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); + bs->brt_decref_entry_still_referenced.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_still_referenced); + bs->brt_decref_free_data_later.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_later); + bs->brt_decref_free_data_now.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_now); + bs->brt_decref_no_entry.value.ui64 = + wmsum_value(&brt_sums.brt_decref_no_entry); + + return (0); +} + +static void +brt_stat_init(void) +{ + + wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); + wmsum_init(&brt_sums.brt_decref_free_data_later, 0); + wmsum_init(&brt_sums.brt_decref_free_data_now, 0); + wmsum_init(&brt_sums.brt_decref_no_entry, 0); + + brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, + sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (brt_ksp != NULL) { + brt_ksp->ks_data = &brt_stats; + brt_ksp->ks_update = brt_kstats_update; + kstat_install(brt_ksp); + } +} + +static void +brt_stat_fini(void) +{ + if (brt_ksp != NULL) { + kstat_delete(brt_ksp); + brt_ksp = NULL; + } + + 
/*
 * Add one reference to the BRT entry for the block pointed to by bp,
 * creating the in-core entry (and, if needed, the in-core VDEV state) when
 * it does not exist yet.
 *
 * The entry is searched for in the VDEV's in-core AVL tree first and then
 * on disk.  Because the on-disk lookup may drop the BRT lock, the tree is
 * searched a second time afterwards and an entry inserted concurrently is
 * reused instead of inserting a duplicate.  The caller must not hold the
 * BRT lock as writer; it is taken and released here.
 */
static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	/* Build the search key (offset, refcount 0) and find the VDEV id. */
	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* New VDEV was added. */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT lock while it reads
		 * the ZAP; when it does, it reacquires the lock as writer
		 * before returning.
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/*
		 * On success bre_search now carries the on-disk
		 * bre_refcount; on ENOENT it still holds the 0 set by
		 * brt_entry_fill().
		 */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * When the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated, we need to update brtvd's pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		/* Search again: another thread may have inserted the entry. */
		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped in
			 * brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	/* Take the new reference and update the per-VDEV accounting. */
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}
/*
 * Drop one reference from the BRT entry for the block pointed to by bp.
 *
 * Returns B_TRUE if the block should be freed immediately: either no BRT
 * entry exists for it (in memory or on disk), or the entry found already
 * has a reference count of zero.  Otherwise the reference count is
 * decremented, the per-VDEV accounting is updated and B_FALSE is returned.
 *
 * The BRT lock is taken as writer here and released before returning.
 */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	/* Build the search key (offset, refcount 0) and find the VDEV id. */
	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	/* Fast path: the entry is already in the in-core tree. */
	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* On success bre_search now contains the on-disk bre_refcount. */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated, we need to update brtvd's pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	/* Search again: another thread may have inserted the entry. */
	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	/* Bring the on-disk entry into the in-core tree. */
	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	/* No references left on the entry: the data can go away now. */
	if (bre->bre_refcount == 0) {
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}
mutex_exit(pending_lock); + + if (newbpe != NULL) { + ASSERT(bpe != NULL); + ASSERT(bpe != newbpe); + kmem_cache_free(brt_pending_entry_cache, newbpe); + } else { + ASSERT(bpe == NULL); + } + + /* Prefetch BRT entry, as we will need it in the syncing context. */ + brt_prefetch(brt, bp); +} + +void +brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) +{ + brt_t *brt; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + brt_pending_entry_t *bpe, bpe_search; + uint64_t txg; + + brt = spa->spa_brt; + txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, !=, 0); + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + bpe_search.bpe_bp = *bp; + + mutex_enter(pending_lock); + + bpe = avl_find(pending_tree, &bpe_search, NULL); + /* I believe we should always find bpe when this function is called. */ + if (bpe != NULL) { + ASSERT(bpe->bpe_count > 0); + + bpe->bpe_count--; + if (bpe->bpe_count == 0) { + avl_remove(pending_tree, bpe); + kmem_cache_free(brt_pending_entry_cache, bpe); + } + } + + mutex_exit(pending_lock); +} + +void +brt_pending_apply(spa_t *spa, uint64_t txg) +{ + brt_t *brt; + brt_pending_entry_t *bpe; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + void *c; + + ASSERT3U(txg, !=, 0); + + brt = spa->spa_brt; + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + mutex_enter(pending_lock); + + c = NULL; + while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { + boolean_t added_to_ddt; + + mutex_exit(pending_lock); + + for (int i = 0; i < bpe->bpe_count; i++) { + /* + * If the block has DEDUP bit set, it means that it + * already exists in the DEDUP table, so we can just + * use that instead of creating new entry in + * the BRT table. 
+ */ + if (BP_GET_DEDUP(&bpe->bpe_bp)) { + added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); + } else { + added_to_ddt = B_FALSE; + } + if (!added_to_ddt) + brt_entry_addref(brt, &bpe->bpe_bp); + } + + kmem_cache_free(brt_pending_entry_cache, bpe); + mutex_enter(pending_lock); + } + + mutex_exit(pending_lock); +} + +static void +brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +{ + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_entries != 0); + + if (bre->bre_refcount == 0) { + int error; + + error = brt_entry_remove(brt, brtvd, bre, tx); + ASSERT(error == 0 || error == ENOENT); + /* + * If error == ENOENT then zfs_clone_range() was done from a + * removed (but opened) file (open(), unlink()). + */ + ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + } else { + VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + } +} + +static void +brt_sync_table(brt_t *brt, dmu_tx_t *tx) +{ + brt_vdev_t *brtvd; + brt_entry_t *bre; + uint64_t vdevid; + void *c; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + + if (!brtvd->bv_initiated) + continue; + + if (!brtvd->bv_meta_dirty) { + ASSERT(!brtvd->bv_entcount_dirty); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + continue; + } + + ASSERT(!brtvd->bv_entcount_dirty || + avl_numnodes(&brtvd->bv_tree) != 0); + + if (brtvd->bv_mos_brtvdev == 0) + brt_vdev_create(brt, brtvd, tx); + + c = NULL; + while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { + brt_sync_entry(brt, brtvd, bre, tx); + brt_entry_free(bre); + ASSERT(brt->brt_nentries > 0); + brt->brt_nentries--; + } + + brt_vdev_sync(brt, brtvd, tx); + + if (brtvd->bv_totalcount == 0) + brt_vdev_destroy(brt, brtvd, tx); + } + + ASSERT0(brt->brt_nentries); + + brt_unlock(brt); +} + +void +brt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + brt_t *brt; + + ASSERT(spa_syncing_txg(spa) == txg); + + brt = spa->spa_brt; + brt_rlock(brt); + if (brt->brt_nentries == 0) { 
+ /* No changes. */ + brt_unlock(brt); + return; + } + brt_unlock(brt); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + brt_sync_table(brt, tx); + + dmu_tx_commit(tx); +} + +static void +brt_table_alloc(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brt->brt_pending_tree[i], + brt_pending_entry_compare, + sizeof (brt_pending_entry_t), + offsetof(brt_pending_entry_t, bpe_node)); + mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, + NULL); + } +} + +static void +brt_table_free(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); + + avl_destroy(&brt->brt_pending_tree[i]); + mutex_destroy(&brt->brt_pending_lock[i]); + } +} + +static void +brt_alloc(spa_t *spa) +{ + brt_t *brt; + + ASSERT(spa->spa_brt == NULL); + + brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); + rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); + brt->brt_spa = spa; + brt->brt_rangesize = 0; + brt->brt_nentries = 0; + brt->brt_vdevs = NULL; + brt->brt_nvdevs = 0; + brt_table_alloc(brt); + + spa->spa_brt = brt; +} + +void +brt_create(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_FALSE); +} + +int +brt_load(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_TRUE); + + return (0); +} + +void +brt_unload(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return; + + brt_vdevs_free(brt); + brt_table_free(brt); + rw_destroy(&brt->brt_lock); + kmem_free(brt, sizeof (*brt)); + spa->spa_brt = NULL; +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT entries"); +#ifdef ZFS_BRT_DEBUG +ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); +#endif +/* END CSTYLED */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 191e5e043..94c2ae9d7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/zfs_context.h> @@ -49,6 +50,7 @@ #include <sys/trace_zfs.h> #include <sys/callb.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/vdev.h> #include <cityhash.h> #include <sys/spa_impl.h> @@ -1427,7 +1429,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) } static void -dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; @@ -1436,12 +1438,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); - BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? - dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs); + BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ? + dn->dn_datablksz : BP_GET_LSIZE(dbbp)); + BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); + BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); + BP_SET_BIRTH(bp, dbbp->blk_birth, 0); } } @@ -1451,30 +1453,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) * was taken, ENOENT if no action was taken. 
*/ static int -dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); - int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ - if (!is_hole && db->db_level == 0) { - is_hole = dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr); - } + if (!is_hole && db->db_level == 0) + is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); memset(db->db.db_data, 0, db->db.db_size); - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - dbuf_handle_indirect_hole(db, dn); + if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && + bp->blk_birth != 0) { + dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); @@ -1551,12 +1550,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; + blkptr_t bp, *bpp; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); @@ -1566,16 +1566,46 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, goto early_unlock; } - err = dbuf_read_hole(db, dn); + if (db->db_state == DB_UNCACHED) { + if (db->db_blkptr == NULL) { + bpp = NULL; + } else { + bp = *db->db_blkptr; + bpp = &bp; + } + } else 
{ + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + + ASSERT3S(db->db_state, ==, DB_NOFILL); + + dr = list_head(&db->db_dirty_records); + if (dr == NULL) { + err = EIO; + goto early_unlock; + } else { + dl = &dr->dt.dl; + if (!dl->dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dl->dr_overridden_by; + bpp = &bp; + } + } + + err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; + ASSERT(bpp != NULL); + /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ - if (BP_IS_REDACTED(db->db_blkptr)) { + if (BP_IS_REDACTED(bpp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1590,7 +1620,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); @@ -1621,15 +1651,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* - * The zio layer will copy the provided blkptr later, but we need to - * do this now so that we can release the parent's rwlock. We have to - * do that now so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we have our + * own copy so that we can release the parent's rwlock. We have to + * do that so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. 
*/ - blkptr_t bp = *db->db_blkptr; dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, &bp, + (void) arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); @@ -1731,9 +1760,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1780,13 +1806,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { + } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { boolean_t need_wait = B_FALSE; db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { + if (zio == NULL && (db->db_state == DB_NOFILL || + (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; @@ -1913,7 +1939,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - arc_release(dr->dt.dl.dr_data, db); + if (!dr->dt.dl.dr_brtwrite) + arc_release(dr->dt.dl.dr_data, db); } /* @@ -1996,6 +2023,11 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); + if (dr->dt.dl.dr_brtwrite) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } } else { /* * This dbuf is not dirty in the open context. 
@@ -2285,7 +2317,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DMU_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dmu_objset_willuse_space(os, db->db.db_size, tx); } @@ -2328,8 +2360,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID) + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dr->dr_accounted = db->db.db_size; + } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); @@ -2489,6 +2522,7 @@ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; + boolean_t brtwrite; ASSERT(txg != 0); @@ -2513,6 +2547,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); ASSERT(dr->dr_dbuf == db); + brtwrite = dr->dt.dl.dr_brtwrite; + if (brtwrite) { + /* + * We are freeing a block that we cloned in the same + * transaction group. 
+ */ + brt_pending_remove(dmu_objset_spa(db->db_objset), + &dr->dt.dl.dr_overridden_by, tx); + } + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2542,7 +2586,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); } - if (db->db_state != DB_NOFILL) { + if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); @@ -2557,7 +2601,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + ASSERT(db->db_state == DB_NOFILL || brtwrite || + arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -4748,8 +4793,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { arc_buf_destroy(dr->dt.dl.dr_data, db); + } } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -5046,7 +5093,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, + dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 7880a899a..33fea0ba3 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+ * Copyright (c) 2022 by Pawel Jakub Dawidek */ #include <sys/zfs_context.h> @@ -1180,5 +1181,59 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) return (SET_ERROR(ENOENT)); } +/* + * This function is used by Block Cloning (brt.c) to increase reference + * counter for the DDT entry if the block is already in DDT. + * + * Return false if the block, despite having the D bit set, is not present + * in the DDT. Currently this is not possible but might be in the future. + * See the comment below. + */ +boolean_t +ddt_addref(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t *dde; + boolean_t result; + + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + ddt = ddt_select(spa, bp); + ddt_enter(ddt); + + dde = ddt_lookup(ddt, bp, B_TRUE); + ASSERT(dde != NULL); + + if (dde->dde_type < DDT_TYPES) { + ddt_phys_t *ddp; + + ASSERT3S(dde->dde_class, <, DDT_CLASSES); + + ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; + if (ddp->ddp_refcnt == 0) { + /* This should never happen? */ + ddt_phys_fill(ddp, bp); + } + ddt_phys_addref(ddp); + result = B_TRUE; + } else { + /* + * At the time of implementing this, if the block has the + * DEDUP flag set it must exist in the DEDUP table, but + * there are many advocates that want ability to remove + * entries from DDT with refcnt=1. If this happens, + * we may have a block with the DEDUP set, but which doesn't + * have a corresponding entry in the DDT. Be ready. + */ + ASSERT3S(dde->dde_class, ==, DDT_CLASSES); + ddt_remove(ddt, dde); + result = B_FALSE; + } + + ddt_exit(ddt); + spa_config_exit(spa, SCL_ZIO, FTAG); + + return (result); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 9b8fc7e49..e6bade11c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -29,6 +29,7 @@ * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/dmu.h> @@ -52,6 +53,7 @@ #include <sys/sa.h> #include <sys/zfeature.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/trace_zfs.h> #include <sys/zfs_racct.h> #include <sys/zfs_rlock.h> @@ -513,7 +515,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio_t *zio = NULL; boolean_t missed = B_FALSE; - ASSERT(length <= DMU_MAX_ACCESS); + ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that @@ -2165,6 +2167,155 @@ restart: return (err); } +int +dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp) +{ + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + blkptr_t *bp; + int error, numbufs; + + error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp); + if (error != 0) { + if (error == ESRCH) { + error = SET_ERROR(ENXIO); + } + return (error); + } + + ASSERT3U(numbufs, <=, *nbpsp); + + for (int i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = db->db_blkptr; + + /* + * If the block is not on the disk yet, it has no BP assigned. + * There is not much we can do... + */ + if (!list_is_empty(&db->db_dirty_records)) { + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + if (dr->dt.dl.dr_brtwrite) { + /* + * This is very special case where we clone a + * block and in the same transaction group we + * read its BP (most likely to clone the clone). + */ + bp = &dr->dt.dl.dr_overridden_by; + } else { + /* + * The block was modified in the same + * transaction group. + */ + error = SET_ERROR(EAGAIN); + goto out; + } + } + if (bp == NULL) { + /* + * The block was created in this transaction group, + * so it has no BP yet. 
+ */ + error = SET_ERROR(EAGAIN); + goto out; + } + if (dmu_buf_is_dirty(dbuf, tx)) { + error = SET_ERROR(EAGAIN); + goto out; + } + /* + * Make sure we clone only data blocks. + */ + if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { + error = SET_ERROR(EINVAL); + goto out; + } + + bps[i] = *bp; + } + + *nbpsp = numbufs; +out: + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); +} + +void +dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) +{ + spa_t *spa; + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + const blkptr_t *bp; + int numbufs; + + spa = os->os_spa; + + VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp)); + ASSERT3U(nbps, ==, numbufs); + + for (int i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); + + if (db->db_state == DB_UNCACHED) { + /* + * XXX-PJD: If the dbuf is already cached, calling + * dmu_buf_will_not_fill() will panic on assertion + * (db->db_buf == NULL) in dbuf_clear_data(), + * which is called from dbuf_noread() in DB_NOFILL + * case. I'm not 100% sure this is the right thing + * to do, but it seems to work. 
+ */ + dmu_buf_will_not_fill(dbuf, tx); + } + + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + dl->dr_overridden_by = *bp; + dl->dr_brtwrite = B_TRUE; + + dl->dr_override_state = DR_OVERRIDDEN; + if (BP_IS_HOLE(bp)) { + dl->dr_overridden_by.blk_birth = 0; + dl->dr_overridden_by.blk_phys_birth = 0; + } else { + dl->dr_overridden_by.blk_birth = dr->dr_txg; + dl->dr_overridden_by.blk_phys_birth = + BP_PHYSICAL_BIRTH(bp); + } + + /* + * When data is embedded into BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as + * it contains the data. + * Also, when replaying ZIL we don't want to bump references + * in the BRT as it was already done during ZIL claim. + */ + if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_add(spa, bp, tx); + } + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 815e27a6c..1c5608c45 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -349,7 +349,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx) } static void -dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; @@ -357,15 +357,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) ASSERT(tx->tx_txg == 0); - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; - dmu_tx_count_dnode(txh); - /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. 
@@ -445,8 +441,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } } void @@ -455,8 +453,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } +} + +static void +dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + + /* + * Reuse dmu_tx_count_free(), it does exactly what we need for clone. + */ + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_clone(txh, off, len); + } } static void @@ -667,6 +692,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) case THT_NEWOBJECT: match_object = TRUE; break; + case THT_CLONE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + break; default: cmn_err(CE_PANIC, "bad txh_type %d", txh->txh_type); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 021cba68c..8e3fd126c 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -47,6 +47,7 @@ #include <sys/vdev_impl.h> #include <sys/zil_impl.h> #include <sys/zio_checksum.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> @@ -3499,11 +3500,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_dedup_frees_this_txg = 0; /* - * Write out changes to the DDT 
that may be required as a - * result of the blocks freed. This ensures that the DDT - * is clean when a scrub/resilver runs. + * Write out changes to the DDT and the BRT that may be required + * as a result of the blocks freed. This ensures that the DDT + * and the BRT are clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); + brt_sync(spa, tx->tx_txg); } if (err != 0) return (err); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6be6fe115..98a302237 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -52,6 +52,7 @@ #include <sys/dmu_tx.h> #include <sys/zap.h> #include <sys/zil.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/vdev_impl.h> #include <sys/vdev_removal.h> @@ -341,6 +342,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, + brt_get_used(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, + brt_get_saved(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, + brt_get_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); @@ -1707,6 +1714,7 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + brt_unload(spa); spa_unload_log_sm_metadata(spa); /* @@ -4415,6 +4423,21 @@ spa_ld_load_dedup_tables(spa_t *spa) } static int +spa_ld_load_brt(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = brt_load(spa); + if (error != 0) { + spa_load_failed(spa, "brt_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; @@ -4895,6 +4918,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) if (error != 0) return (error); + error = spa_ld_load_brt(spa); + if (error != 0) + return (error); + /* * Verify the 
logs now to make sure we don't have any unexpected errors * when we claim log blocks later. @@ -5963,6 +5990,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * Create DDTs (dedup tables). */ ddt_create(spa); + /* + * Create BRT table and BRT table object. + */ + brt_create(spa); spa_update_dspace(spa); @@ -9138,6 +9169,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) &spa->spa_deferred_bpobj, tx); } + brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); svr_sync(spa, tx); @@ -9263,6 +9295,13 @@ spa_sync(spa_t *spa, uint64_t txg) ZIO_FLAG_CANFAIL); /* + * Now that there can be no more cloning in this transaction group, + * but we are still before issuing frees, we can process pending BRT + * updates. + */ + brt_pending_apply(spa, txg); + + /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 53763e915..8466fa80e 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -57,6 +57,7 @@ #include <sys/fs/zfs.h> #include <sys/metaslab_impl.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/kstat.h> #include "zfs_prop.h" @@ -1834,7 +1835,7 @@ void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa); + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that @@ -2410,6 +2411,7 @@ spa_init(spa_mode_t mode) unique_init(); zfs_btree_init(); metaslab_stat_init(); + brt_init(); ddt_init(); zio_init(); dmu_init(); @@ -2446,6 +2448,7 @@ spa_fini(void) dmu_fini(); zio_fini(); ddt_fini(); + brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 24ae0a00d..9b859adc5 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -23,7 +23,7 @@ * 
Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - * Portions Copyright 2012 Pawel Jakub Dawidek <[email protected]> + * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 77bf9140d..d009c58d8 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2018 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek */ @@ -891,5 +892,56 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, zil_itx_assign(zilog, itx, tx); } +/* + * Handles TX_CLONE_RANGE transactions. + */ +void +zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t i, partnbps; + + VERIFY(!zil_replaying(zilog, tx)); + + if (zp->z_unlinked) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = 0; + for (i = 0; i < partnbps; i++) { + partlen += BP_GET_LSIZE(&bps[i]); + } + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + itx->itx_sync = (zp->z_sync_cnt != 0); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + 
ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW, "Largest data block to write to zil"); diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c index a5dc5c399..9b351eefc 100644 --- a/module/zfs/zfs_quota.c +++ b/module/zfs/zfs_quota.c @@ -20,8 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek <[email protected]>. - * All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 32be27a8b..04dfda56b 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/types.h> @@ -1162,6 +1163,34 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) return (error); } +static int +zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_clone_range_t *lr = arg2; + znode_t *zp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * Clones can be logged out of order, so don't be surprised if + * the file is gone - just return success. 
+ */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length, + lr->lr_blksz, lr->lr_bps, lr->lr_nbps); + + zrele(zp); + return (error); +} + /* * Callback vectors for replaying records */ @@ -1190,4 +1219,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_setsaxattr, /* TX_SETSAXATTR */ zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ + zfs_replay_clone_range, /* TX_CLONE_RANGE */ }; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 10677d8d9..db80be783 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ /* Portions Copyright 2007 Jeremy Teo */ @@ -50,6 +51,7 @@ #include <sys/txg.h> #include <sys/dbuf.h> #include <sys/policy.h> +#include <sys/zfeature.h> #include <sys/zfs_vnops.h> #include <sys/zfs_quota.h> #include <sys/zfs_vfsops.h> @@ -501,7 +503,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } - if (zn_rlimit_fsize(zp, uio)) { + if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); @@ -995,6 +997,467 @@ zfs_get_done(zgd_t *zgd, int error) kmem_free(zgd, sizeof (zgd_t)); } +static int +zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + int error; + + /* Swap. Not sure if the order of zfs_enter()s is important. 
*/ + if (zfsvfs1 > zfsvfs2) { + zfsvfs_t *tmpzfsvfs; + + tmpzfsvfs = zfsvfs2; + zfsvfs2 = zfsvfs1; + zfsvfs1 = tmpzfsvfs; + } + + error = zfs_enter(zfsvfs1, tag); + if (error != 0) + return (error); + if (zfsvfs1 != zfsvfs2) { + error = zfs_enter(zfsvfs2, tag); + if (error != 0) { + zfs_exit(zfsvfs1, tag); + return (error); + } + } + + return (0); +} + +static void +zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + + zfs_exit(zfsvfs1, tag); + if (zfsvfs1 != zfsvfs2) + zfs_exit(zfsvfs2, tag); +} + +/* + * We split each clone request into chunks that can fit into a single ZIL + * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning + * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives + * us room for storing 1022 block pointers. + * + * On success, the function returns the number of bytes copied in *lenp. + * Note, it doesn't return how many bytes are left to be copied. + */ +int +zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, + uint64_t *outoffp, uint64_t *lenp, cred_t *cr) +{ + zfsvfs_t *inzfsvfs, *outzfsvfs; + objset_t *inos, *outos; + zfs_locked_range_t *inlr, *outlr; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + zilog_t *zilog; + uint64_t inoff, outoff, len, done; + uint64_t outsize, size; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t uid, gid, projid; + blkptr_t *bps; + size_t maxblocks, nbps; + uint_t inblksz; + uint64_t clear_setid_bits_txg = 0; + + inoff = *inoffp; + outoff = *outoffp; + len = *lenp; + done = 0; + + inzfsvfs = ZTOZSB(inzp); + outzfsvfs = ZTOZSB(outzp); + inos = inzfsvfs->z_os; + outos = outzfsvfs->z_os; + + /* + * Both source and destination have to belong to the same storage pool. 
+ */ + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. + */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + + ASSERT(!outzfsvfs->z_replay); + + error = zfs_verify_zp(inzp); + if (error == 0) + error = zfs_verify_zp(outzp); + if (error != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (error); + } + + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * We don't copy source file's flags that's why we don't allow to clone + * files that are in quarantine. + */ + if (inzp->z_pflags & ZFS_AV_QUARANTINED) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EACCES)); + } + + if (inoff >= inzp->z_size) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + if (len > inzp->z_size - inoff) { + len = inzp->z_size - inoff; + } + if (len == 0) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(outzfsvfs)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EPERM)); + } + + /* + * No overlapping if we are cloning within the same file. + */ + if (inzp == outzp) { + if (inoff < outoff + len && outoff < inoff + len) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + /* + * Maintain predictable lock order. 
+ */ + if (inzp < outzp || (inzp == outzp && inoff < outoff)) { + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + } + + inblksz = inzp->z_blksz; + + /* + * We cannot clone into files with different block size. + */ + if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { + error = SET_ERROR(EXDEV); + goto unlock; + } + + /* + * Offsets and len must be at block boundaries. + */ + if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { + error = SET_ERROR(EXDEV); + goto unlock; + } + /* + * Length must be a multiple of blksz, except for the end of the file. + */ + if ((len % inblksz) != 0 && + (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { + error = SET_ERROR(EXDEV); + goto unlock; + } + + error = zn_rlimit_fsize(outoff + len); + if (error != 0) { + goto unlock; + } + + if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { + error = SET_ERROR(EFBIG); + goto unlock; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, + &outzp->z_size, 8); + + zilog = outzfsvfs->z_log; + maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + + uid = KUID_TO_SUID(ZTOUID(outzp)); + gid = KGID_TO_SGID(ZTOGID(outzp)); + projid = outzp->z_projid; + + bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + + /* + * Clone the file in reasonable size chunks. Each chunk is cloned + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. 
+ */ + while (len > 0) { + size = MIN(inblksz * maxblocks, len); + + if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, + uid) || + zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, + gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); + + nbps = maxblocks; + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps, + &nbps); + if (error != 0) { + dmu_tx_abort(tx); + /* + * If we are trying to clone a block that was created + * in the current transaction group, return an error + * so the caller can fall back to just copying the data. + */ + if (error == EAGAIN) { + error = SET_ERROR(EXDEV); + } + break; + } + /* + * Encrypted data is fine as long as it comes from the same + * dataset. + * TODO: We want to extend it in the future to allow cloning to + * datasets with the same keys, like clones or to be able to + * clone a file from a snapshot of an encrypted dataset into the + * dataset itself. + */ + if (BP_IS_PROTECTED(&bps[0])) { + if (inzfsvfs != outzfsvfs) { + dmu_tx_abort(tx); + error = SET_ERROR(EXDEV); + break; + } + } + + dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, outzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + + /* + * Copy source znode's block size. This only happens on the + * first iteration since zfs_rangelock_reduce() will shrink down + * lr_len to the appropriate size. + */ + if (outlr->lr_length == UINT64_MAX) { + zfs_grow_blocksize(outzp, inblksz, tx); + /* + * Round range lock up to the block boundary, so we + * prevent appends until we are done. 
+ */ + zfs_rangelock_reduce(outlr, outoff, + ((len - 1) / inblksz + 1) * inblksz); + } + + dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps, + B_FALSE); + + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, + &clear_setid_bits_txg, tx); + + zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((outsize = outzp->z_size) < outoff + size) { + (void) atomic_cas_64(&outzp->z_size, outsize, + outoff + size); + } + + error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); + + zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, + size, inblksz, bps, nbps); + + dmu_tx_commit(tx); + + if (error != 0) + break; + + inoff += size; + outoff += size; + len -= size; + done += size; + } + + kmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_znode_update_vfs(outzp); + +unlock: + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + + if (done > 0) { + /* + * If we have made at least partial progress, reset the error. + */ + error = 0; + + ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); + + if (outos->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog, outzp->z_id); + } + + *inoffp += done; + *outoffp += done; + *lenp = done; + } + + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + + return (error); +} + +/* + * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), + * but we cannot do that, because when replaying we don't have source znode + * available. This is why we need a dedicated replay function. 
+ */ +int +zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, + const blkptr_t *bps, size_t nbps) +{ + zfsvfs_t *zfsvfs; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + + ASSERT3U(off, <, MAXOFFSET_T); + ASSERT3U(len, >, 0); + ASSERT3U(nbps, >, 0); + + zfsvfs = ZTOZSB(zp); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)); + + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + ASSERT(zfsvfs->z_replay); + ASSERT(!zfs_is_readonly(zfsvfs)); + + if ((off % blksz) != 0) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + zfs_exit(zfsvfs, FTAG); + return (error); + } + + if (zp->z_blksz < blksz) + zfs_grow_blocksize(zp, blksz, tx); + + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + if (zp->z_size < off + len) + zp->z_size = off + len; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zfsvfs->z_log, tx)); + + dmu_tx_commit(tx); + + zfs_znode_update_vfs(zp); + + zfs_exit(zfsvfs, FTAG); + + return (error); +} + EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); @@ -1002,6 +1465,8 @@ EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); +EXPORT_SYMBOL(zfs_clone_range); +EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index fcf4e7357..fba1c1999 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -43,6 +43,7 @@ #include <sys/metaslab.h> #include <sys/trace_zfs.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/wmsum.h> /* @@ -578,14 +579,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, } static int -zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t first_txg) +zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; - if (lrc->lrc_txtype != TX_WRITE) - return (0); + ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If the block is not readable, don't claim it. This can happen @@ -605,6 +604,57 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, } static int +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa; + uint_t ii; + + ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + + if (tx == NULL) { + return (0); + } + + /* + * XXX: Do we need to byteswap lr? + */ + + spa = zilog->zl_spa; + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + /* + * When data is embedded into BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as + * it contains the data. 
+ */ + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_add(spa, bp, tx); + } + } + + return (0); +} + /* + * Claim-time dispatch on the record type: TX_WRITE records are handed to + * zil_claim_write() and TX_CLONE_RANGE records to zil_claim_clone_range(). + * Every other record type is ignored and reported as success (0). + */ +static int +zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) +{ + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_claim_write(zilog, lrc, tx, first_txg)); + case TX_CLONE_RANGE: + return (zil_claim_clone_range(zilog, lrc, tx)); + default: + return (0); + } +} + +static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { @@ -616,24 +666,71 @@ zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, } static int -zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t claim_txg) +zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; + ASSERT(lrc->lrc_txtype == TX_WRITE); + /* * If we previously claimed it, we need to free it. */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) + if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && + !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + } return (0); } /* + * Free every non-hole block pointer carried by a TX_CLONE_RANGE record, + * using the txg of the supplied tx. Does nothing when tx is NULL. + * Always returns 0. 
+ */ static int +zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa; + uint_t ii; + + ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + + if (tx == NULL) { + return (0); + } + + spa = zilog->zl_spa; + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + if (!BP_IS_HOLE(bp)) { + zio_free(spa, dmu_tx_get_txg(tx), bp); + } + } + + return (0); +} + /* + * Free-time dispatch on the record type. Nothing is freed when claim_txg + * is 0; otherwise TX_WRITE and TX_CLONE_RANGE records are passed to their + * dedicated helpers and all other record types are ignored. + */ +static int +zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t claim_txg) +{ + + if (claim_txg == 0) { + return (0); + } + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_free_write(zilog, lrc, tx, claim_txg)); + case TX_CLONE_RANGE: + 
return (zil_free_clone_range(zilog, lrc, tx)); + default: + return (0); + } +} + +static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; @@ -1798,13 +1895,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) } /* - * Maximum amount of write data that can be put into single log block. + * Maximum amount of data that can be put into single log block. */ uint64_t -zil_max_log_data(zilog_t *zilog) +zil_max_log_data(zilog_t *zilog, size_t hdrsize) { - return (zilog->zl_max_block_size - - sizeof (zil_chain_t) - sizeof (lr_write_t)); + return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* @@ -1814,7 +1910,7 @@ zil_max_log_data(zilog_t *zilog) static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8); } /* @@ -1887,7 +1983,7 @@ cont: * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - max_log_data = zil_max_log_data(zilog); + max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d17ee60dc..1b1a1831f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -41,6 +41,7 @@ #include <sys/zio_checksum.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> @@ -1176,12 +1177,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, + boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed @@ -1190,6 +1193,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; + zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } @@ -1222,7 +1226,8 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && - !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || + brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { @@ -1249,11 +1254,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, arc_freed(spa, bp); dsl_scan_freed(spa, bp); - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || + brt_maybe_exists(spa, bp)) { /* - * GANG and DEDUP blocks can induce a read (for the gang block - * header, or the DDT), so issue them asynchronously so that - * this thread is not tied up. + * GANG, DEDUP and BRT blocks can induce a read (for the gang + * block header, the DDT or the BRT), so issue them + * asynchronously so that this thread is not tied up. 
*/ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; @@ -1594,11 +1601,15 @@ zio_write_bp_init(zio_t *zio) zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zp->zp_brtwrite) + return (zio); + + ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); + if (BP_IS_EMBEDDED(bp)) return (zio); @@ -3044,6 +3055,35 @@ zio_nop_write(zio_t *zio) /* * ========================================================================== + * Block Reference Table + * ========================================================================== + */ +static zio_t * +zio_brt_free(zio_t *zio) +{ + blkptr_t *bp; + + bp = zio->io_bp; + /* + * Only level-0, non-metadata blocks for which brt_maybe_exists() + * reports a possible entry need the reference drop below; any + * other block continues through the pipeline unchanged. + */ + if (BP_GET_LEVEL(bp) > 0 || + BP_IS_METADATA(bp) || + !brt_maybe_exists(zio->io_spa, bp)) { + return (zio); + } + + if (!brt_entry_decref(zio->io_spa, bp)) { + /* + * This isn't the last reference, so we cannot free + * the data yet. 
+ */ + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } + + return (zio); +} + +/* + * ========================================================================== * Dedup * ========================================================================== */ @@ -4894,6 +4934,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_encrypt, zio_checksum_generate, zio_nop_write, + zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 1511f763f..06bc75c63 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -482,6 +482,60 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed + * after a system failure. + * + * TODO: For now we drop block cloning transactions for ZVOLs as they are + * unsupported, but we still need to inform BRT about that as we + * claimed them during pool import. 
+ * This situation can occur when we try to import a pool from a ZFS + * version supporting block cloning for ZVOLs into a system that + * has this ZFS version, that doesn't support block cloning for ZVOLs. + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + zvol_state_t *zv = arg1; + objset_t *os = zv->zv_objset; + lr_clone_range_t *lr = arg2; + blkptr_t *bp; + dmu_tx_t *tx; + spa_t *spa; + uint_t ii; + int error; + + dmu_objset_name(os, name); + cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", + name); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + tx = dmu_tx_create(os); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + spa = os->os_spa; + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + if (!BP_IS_HOLE(bp)) { + zio_free(spa, dmu_tx_get_txg(tx), bp); + } + } + + (void) zil_replaying(zv->zv_zilog, tx); + dmu_tx_commit(tx); + + return (0); +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -516,6 +570,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ + zvol_replay_clone_range /* TX_CLONE_RANGE */ }; /* diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 99a70fa2c..097cd52e4 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -58,6 +58,9 @@ typeset -a properties=( "multihost" "autotrim" "compatibility" + "bcloneused" + "bclonesaved" + "bcloneratio" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" @@ -100,5 +103,6 @@ if is_linux || is_freebsd; then "feature@zilsaxattr" "feature@head_errlog" 
"feature@blake3" + "feature@block_cloning" ) fi |