diff options
author | Pawel Jakub Dawidek <[email protected]> | 2023-03-10 20:59:53 +0100 |
---|---|---|
committer | GitHub <[email protected]> | 2023-03-10 11:59:53 -0800 |
commit | 67a1b0379159c46bcd60a462a2790248046c8804 (patch) | |
tree | dbba99ec9db66f8afefebad07caa22d36f04f3ff /include/sys | |
parent | da19d919a853ad05ef300fe000e6c96c4db84bcf (diff) |
Implementation of block cloning for ZFS
Block cloning allows a file (or a subset of its blocks) to be manually
cloned into another (or the same) file by just creating additional
references to the data blocks, without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).
The whole design of block cloning is documented in module/zfs/brt.c.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Christian Schwarz <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Rich Ercolani <[email protected]>
Signed-off-by: Pawel Jakub Dawidek <[email protected]>
Closes #13392
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/bitmap.h | 93 | ||||
-rw-r--r-- | include/sys/brt.h | 62 | ||||
-rw-r--r-- | include/sys/dbuf.h | 1 | ||||
-rw-r--r-- | include/sys/ddt.h | 2 | ||||
-rw-r--r-- | include/sys/dmu.h | 8 | ||||
-rw-r--r-- | include/sys/dmu_tx.h | 1 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 3 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 1 | ||||
-rw-r--r-- | include/sys/zfs_debug.h | 1 | ||||
-rw-r--r-- | include/sys/zfs_vnops.h | 4 | ||||
-rw-r--r-- | include/sys/zfs_znode.h | 3 | ||||
-rw-r--r-- | include/sys/zil.h | 25 | ||||
-rw-r--r-- | include/sys/zio.h | 3 | ||||
-rw-r--r-- | include/sys/zio_impl.h | 41 |
14 files changed, 225 insertions, 23 deletions
diff --git a/include/sys/bitmap.h b/include/sys/bitmap.h new file mode 100644 index 000000000..7b92507a7 --- /dev/null +++ b/include/sys/bitmap.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. 
+ */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW are internal; there is no need + * for users of this package to use them. + */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/include/sys/brt.h b/include/sys/brt.h new file mode 100644 index 000000000..b1f701077 --- /dev/null +++ b/include/sys/brt.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#ifndef _SYS_BRT_H +#define _SYS_BRT_H + +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); + +extern uint64_t brt_get_dspace(spa_t *spa); +extern uint64_t brt_get_used(spa_t *spa); +extern uint64_t brt_get_saved(spa_t *spa); +extern uint64_t brt_get_ratio(spa_t *spa); + +extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp); +extern void brt_init(void); +extern void brt_fini(void); + +extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_apply(spa_t *spa, uint64_t txg); + +extern void brt_create(spa_t *spa); +extern int brt_load(spa_t *spa); +extern void brt_unload(spa_t *spa); +extern void brt_sync(spa_t *spa, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRT_H */ diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index a1ce76b1c..a06316362 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -172,6 +172,7 @@ typedef struct dbuf_dirty_record { override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; + boolean_t dr_brtwrite; boolean_t dr_has_raw_params; /* diff --git a/include/sys/ddt.h b/include/sys/ddt.h index d72401dcf..6378c042c 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -248,6 +248,8 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t 
*ddb, ddt_entry_t *dde); extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); +extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); + extern const ddt_ops_t ddt_zap_ops; #ifdef __cplusplus diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 93de991cc..1b82ff620 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -782,6 +782,8 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, @@ -1059,6 +1061,12 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); +int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); +void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, + boolean_t replay); + /* * Initial setup and final teardown. 
*/ diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 81e1ef6c1..ca8514e5d 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -90,6 +90,7 @@ enum dmu_tx_hold_type { THT_ZAP, THT_SPACE, THT_SPILL, + THT_CLONE, THT_NUMTYPES }; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e869685c5..25babd4ea 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -253,6 +253,9 @@ typedef enum { ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, + ZPOOL_PROP_BCLONEUSED, + ZPOOL_PROP_BCLONESAVED, + ZPOOL_PROP_BCLONERATIO, ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index cde08ec9b..8ccd58b58 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -380,6 +380,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 481209b24..a1dfef1d8 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -57,6 +57,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_TRIM (1 << 11) #define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) +#define ZFS_DEBUG_BRT (1 << 14) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index edff8f681..5da103f17 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -31,6 +31,10 @@ extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_holey(znode_t *, ulong_t, loff_t *); extern int 
zfs_access(znode_t *, int, int, cred_t *); +extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, + uint64_t *, cred_t *); +extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, + const blkptr_t *, size_t); extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index fcee55b01..012e7403e 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -315,6 +315,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz, + const blkptr_t *bps, size_t nbps); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index 9ac421043..cff8ebcad 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -166,7 +166,8 @@ typedef enum zil_create { #define TX_SETSAXATTR 21 /* Set sa xattrs on file */ #define TX_RENAME_EXCHANGE 22 /* Atomic swap via renameat2 */ #define TX_RENAME_WHITEOUT 23 /* Atomic whiteout via renameat2 */ -#define TX_MAX_TYPE 24 /* Max transaction type */ +#define TX_CLONE_RANGE 24 /* Clone a file range */ +#define TX_MAX_TYPE 25 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -176,9 +177,9 @@ typedef enum zil_create { #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* - * Transactions for write, truncate, setattr, acl_v0, and acl can be logged - * out of order. 
For convenience in the code, all such records must have - * lr_foid at the same offset. + * Transactions for operations below can be logged out of order. + * For convenience in the code, all such records must have lr_foid + * at the same offset. */ #define TX_OOO(txtype) \ ((txtype) == TX_WRITE || \ @@ -187,7 +188,8 @@ typedef enum zil_create { (txtype) == TX_ACL_V0 || \ (txtype) == TX_ACL || \ (txtype) == TX_WRITE2 || \ - (txtype) == TX_SETSAXATTR) + (txtype) == TX_SETSAXATTR || \ + (txtype) == TX_CLONE_RANGE) /* * The number of dnode slots consumed by the object is stored in the 8 @@ -387,6 +389,17 @@ typedef struct { /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to clone into */ + uint64_t lr_offset; /* offset to clone to */ + uint64_t lr_length; /* length of the blocks to clone */ + uint64_t lr_blksz; /* file's block size */ + uint64_t lr_nbps; /* number of block pointers */ + blkptr_t lr_bps[]; + /* block pointers of the blocks to clone follows */ +} lr_clone_range_t; + /* * ZIL structure definitions, interface function prototype and globals. 
*/ @@ -574,7 +587,7 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); -extern uint64_t zil_max_log_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); extern void zil_sums_init(zil_sums_t *zs); extern void zil_sums_fini(zil_sums_t *zs); diff --git a/include/sys/zio.h b/include/sys/zio.h index 28ed837d8..78603d0eb 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -348,6 +348,7 @@ typedef struct zio_prop { boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; + boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; @@ -556,7 +557,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite); + boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 199cca291..29a05986c 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -77,6 +77,12 @@ extern "C" { * and zstd. Compression occurs as part of the write pipeline and is * performed in the ZIO_STAGE_WRITE_BP_INIT stage. * + * Block cloning: + * The block cloning functionality introduces ZIO_STAGE_BRT_FREE stage which + * is called during a free pipeline. If the block is referenced in the + * Block Reference Table (BRT) we will just decrease its reference counter + * instead of actually freeing the block. + * * Dedup: * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and * ZIO_STAGE_DDT_READ_DONE stages. 
These stages are added to an existing @@ -127,28 +133,30 @@ enum zio_stage { ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */ + ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */ + + ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */ - ZIO_STAGE_READY = 1 << 19, /* RWFCI */ + ZIO_STAGE_READY = 1 << 20, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */ - ZIO_STAGE_DONE = 1 << 24 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -233,6 +241,7 @@ enum zio_stage { #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_BRT_FREE | \ ZIO_STAGE_DVA_FREE) #define ZIO_DDT_FREE_PIPELINE \ |