diff options
author | Rob Norris <[email protected]> | 2023-06-22 17:46:22 +1000 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2024-08-16 12:03:35 -0700 |
commit | cd69ba3d49cdb939cba87e7fd6814608532df92f (patch) | |
tree | 32d51c27ae62b5145e5b9fd99b00930ff3c22f95 | |
parent | cbb9ef0a4c8e04358f7d5ddae0eb99d0f703ee21 (diff) |
ddt: dedup log
Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, instead its adding to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree.
Lookups first go the the log tree before going to the ZAP, so
recently-used entries will remain close by in memory. This vastly
reduces overhead from dedup IO, as it will not have to do so many
read/update/write cycles on ZAP leaf nodes.
A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There's actually two separate "logs" (in-memory tree and
on-disk object), one active (recieving updated entries) and one flushing
(writing out to disk). These are swapped (ie flushing begins) based on
memory used by the in-memory log trees and time since we last flushed
something.
The flushing facility monitors the amount of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.
Multiple tuneables are provided to control the flushing facility.
All the histograms and stats are update to accomodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them. Documentation included!
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Co-authored-by: Allan Jude <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
-rw-r--r-- | cmd/zdb/zdb.c | 33 | ||||
-rw-r--r-- | include/sys/ddt.h | 39 | ||||
-rw-r--r-- | include/sys/ddt_impl.h | 131 | ||||
-rw-r--r-- | include/sys/dmu.h | 1 | ||||
-rw-r--r-- | lib/libzpool/Makefile.am | 1 | ||||
-rw-r--r-- | man/man4/zfs.4 | 82 | ||||
-rw-r--r-- | module/Kbuild.in | 1 | ||||
-rw-r--r-- | module/Makefile.bsd | 2 | ||||
-rw-r--r-- | module/zfs/ddt.c | 604 | ||||
-rw-r--r-- | module/zfs/ddt_log.c | 760 | ||||
-rw-r--r-- | module/zfs/ddt_stats.c | 9 | ||||
-rw-r--r-- | tests/zfs-tests/include/tunables.cfg | 1 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh | 7 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh | 7 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh | 7 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh | 7 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh | 18 |
17 files changed, 1600 insertions, 110 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 250052adf..c72df3909 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1959,6 +1959,32 @@ dump_dedup_ratio(const ddt_stat_t *dds) } static void +dump_ddt_log(ddt_t *ddt) +{ + for (int n = 0; n < 2; n++) { + ddt_log_t *ddl = &ddt->ddt_log[n]; + + uint64_t count = avl_numnodes(&ddl->ddl_tree); + if (count == 0) + continue; + + printf(DMU_POOL_DDT_LOG ": %lu log entries\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); + + if (dump_opt['D'] < 4) + continue; + + ddt_lightweight_entry_t ddlwe; + uint64_t index = 0; + for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + dump_ddt_entry(ddt, &ddlwe, index++); + } + } +} + +static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; @@ -2027,6 +2053,7 @@ dump_all_ddts(spa_t *spa) dump_ddt(ddt, type, class); } } + dump_ddt_log(ddt); } ddt_get_dedup_stats(spa, &dds_total); @@ -5743,7 +5770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); + VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); ddt_phys_decref(dde->dde_phys, v); /* @@ -8120,6 +8147,10 @@ dump_mos_leaks(spa_t *spa) /* FDT container */ mos_obj_refd(ddt->ddt_dir_object); + + /* FDT log objects */ + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); } if (spa->spa_brt != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 2dd18526d..2fc798725 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -43,7 +43,8 @@ struct abd; * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ -#define DDT_FLAG_MASK (DDT_FLAG_FLAT) +#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -209,6 +210,7 @@ typedef enum { /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ /* * Additional data to support entry update or repair. This is fixed size @@ -255,6 +257,19 @@ typedef struct { } ddt_lightweight_entry_t; /* + * In-core DDT log. A separate struct to make it easier to switch between the + * appending and flushing logs. + */ +typedef struct { + avl_tree_t ddl_tree; /* logged entries */ + uint32_t ddl_flags; /* flags for this log */ + uint64_t ddl_object; /* log object id */ + uint64_t ddl_length; /* on-disk log size */ + uint64_t ddl_first_txg; /* txg log became active */ + ddt_key_t ddl_checkpoint; /* last checkpoint */ +} ddt_log_t; + +/* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. */ @@ -262,8 +277,22 @@ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + avl_tree_t ddt_log_tree; /* logged entries */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + ddt_log_t ddt_log[2]; /* active/flushing logs */ + ddt_log_t *ddt_log_active; /* pointers into ddt_log */ + ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ + + hrtime_t ddt_flush_start; /* log flush start this txg */ + uint32_t ddt_flush_pass; /* log flush pass this txg */ + + int32_t ddt_flush_count; /* entries flushed this txg */ + int32_t ddt_flush_min; /* min rem entries to flush */ + int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ + int32_t ddt_log_flush_rate; /* rolling log flush rate */ + int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ @@ -276,13 +305,17 @@ typedef struct { /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - /* object ids for whole-ddt and per-type/per-class stats */ + /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; + ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + + /* log stats power-2-sized referenced blocks */ + ddt_histogram_t ddt_log_histogram; } ddt_t; /* diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index ce4bc559d..6f11cd90c 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -28,6 +28,7 @@ #define _SYS_DDT_IMPL_H #include <sys/ddt.h> +#include <sys/bitops.h> #ifdef __cplusplus extern "C" { @@ -50,6 +51,106 @@ extern "C" { memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) +#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ + (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ + (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +/* + * An entry on the log tree. These are "frozen", and a record of what's in + * the on-disk log. They can't be used in place, but can be "loaded" back into + * the live tree. + */ +typedef struct { + ddt_key_t ddle_key; /* ddt_log_tree key */ + avl_node_t ddle_node; /* ddt_log_tree node */ + + ddt_type_t ddle_type; /* storage type */ + ddt_class_t ddle_class; /* storage class */ + + /* extra allocation for flat/trad phys */ + ddt_univ_phys_t ddle_phys[]; +} ddt_log_entry_t; + +/* On-disk log record types. */ +typedef enum { + DLR_INVALID = 0, /* end of block marker */ + DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ +} ddt_log_record_type_t; + +/* On-disk log record header. */ +typedef struct { + /* + * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to + * access it. + * + * bits 0-7: record type (ddt_log_record_type_t) + * bits 8-15: length of record header+payload + * bits 16-47: reserved, all zero + * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) + * otherwise all zero + * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) + * otherwise all zero + */ + uint64_t dlr_info; + uint8_t dlr_payload[]; +} ddt_log_record_t; + +#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) +#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) +#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) +#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) +#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) +#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) +#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) +#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) + +/* Payload for DLR_ENTRY. */ +typedef struct { + ddt_key_t dlre_key; + ddt_univ_phys_t dlre_phys[]; +} ddt_log_record_entry_t; + +/* Log flags (ddl_flags, dlh_flags) */ +#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ +#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ + +/* On-disk log header, stored in the bonus buffer. */ +typedef struct { + /* + * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to + * access it. + * + * bits 0-7: log version + * bits 8-15: log flags + * bits 16-63: reserved, all zero + */ + uint64_t dlh_info; + + uint64_t dlh_length; /* log size in bytes */ + uint64_t dlh_first_txg; /* txg this log went active */ + ddt_key_t dlh_checkpoint; /* last checkpoint */ +} ddt_log_header_t; + +#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) +#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) +#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) +#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) + +/* DDT log update state */ +typedef struct { + dmu_tx_t *dlu_tx; /* tx the update is being applied to */ + dnode_t *dlu_dn; /* log object dnode */ + dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ + int dlu_ndbp; /* number of block buffer pointers */ + uint16_t dlu_reclen; /* cached length of record */ + uint64_t dlu_block; /* block for next entry */ + uint64_t dlu_offset; /* offset for next entry */ +} ddt_log_update_t; + /* * Ops vector to access a specific DDT object type. */ @@ -77,6 +178,33 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; +/* Dedup log API */ +extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, + ddt_log_update_t *dlu); +extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, + ddt_log_update_t *dlu); +extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); + +extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + dmu_tx_t *tx); +extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); + +extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); + +extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); + +extern int ddt_log_load(ddt_t *ddt); +extern void ddt_log_alloc(ddt_t *ddt); +extern void ddt_log_free(ddt_t *ddt); + +extern void ddt_log_init(void); +extern void ddt_log_fini(void); + /* * These are only exposed so that zdb can access them. Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops; */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, + const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5b80dc315..928f5f2b4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -375,6 +375,7 @@ typedef struct dmu_buf { #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_LOG "DDT-log-%s-%u" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db..070dc0132 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_log.c \ module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 45b6c338a..aae3d7dfb 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -974,6 +974,88 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . +.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of dedup log flush passes (iterations) each transaction. +.Pp +At the start of each transaction, OpenZFS will estimate how many entries it +needs to flush out to keep up with the change rate, taking the amount and time +taken to flush on previous txgs into account (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +It will spread this amount into a number of passes. +At each pass, it will use the amount already flushed and the total time taken +by flushing and by other IO to recompute how much it should do for the remainder +of the txg. +.Pp +Reducing the max number of passes will make flushing more aggressive, flushing +out more entries on each pass. +This can be faster, but also more likely to compete with other IO. +Increasing the max number of passes will put fewer entries onto each pass, +keeping the overhead of dedup changes to a minimum but possibly causing a large +number of changes to be dumped on the last pass, which can blow out the txg +sync time beyond +.Sy zfs_txg_timeout . +. +.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint +Minimum time to spend on dedup log flush each transaction. +.Pp +At least this long will be spent flushing dedup log entries each transaction, +up to +.Sy zfs_txg_timeout . +This occurs even if doing so would delay the transaction, that is, other IO +completes under this time. +. +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +Flush at least this many entries each transaction. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction to +keep up with the ingest rate (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +This sets the minimum for that estimate. +Raising it can force OpenZFS to flush more aggressively, keeping the log small +and so reducing pool import times, but can make it less able to back off if +log flushing would compete with other IO too much. +. +.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint +Number of transactions to use to compute the flow rate. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction by +monitoring the number of entries changed (ingest rate), number of entries +flushed (flush rate) and time spent flushing (flush time rate) and combining +these into an overall "flow rate". +It will use an exponential weighted moving average over some number of recent +transactions to compute these rates. +This sets the number of transactions to compute these averages over. +Setting it higher can help to smooth out the flow rate in the face of spiky +workloads, but will take longer for the flow rate to adjust to a sustained +change in the ingress rate. +. +.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint +Max transactions to before starting to flush dedup logs. +.Pp +OpenZFS maintains two dedup logs, one receiving new changes, one flushing. +If there is nothing to flush, it will accumulate changes for no more than this +many transactions before switching the logs and starting to flush entries out. +. +.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 +Max memory to use for dedup logs. +.Pp +OpenZFS will spend no more than this much memory on maintaining the in-memory +dedup log. +Flushing will begin when around half this amount is being spent on logs. +The default value of +.Sy 0 +will cause it to be set by +.Sy zfs_dedup_log_mem_max_percent +instead. +. +.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint +Max memory to use for dedup logs, as a percentage of total memory. +.Pp +If +.Sy zfs_dedup_log_mem_max +is not set, it will be initialised as a percentage of the total memory in the +system. +. .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of diff --git a/module/Kbuild.in b/module/Kbuild.in index 57682214d..a119198db 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -322,6 +322,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index d9d31564d..534f32571 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -252,6 +252,7 @@ SRCS+= abd.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 26e127d61..ce5c4efb5 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,28 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup log + * + * Historically, all entries modified on a txg were written back to dedup + * storage objects at the end of every txg. This could cause significant + * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, + * and so required reading the whole node, updating the entry, and writing it + * back. On busy pools, this could add serious IO and memory overheads. + * + * To address this, the dedup log was added. If the "fast_dedup" feature is + * enabled, at the end of each txg, modified entries will be copied to an + * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the + * same block is requested again, the in-memory object will be checked first, + * and if its there, the entry inflated back onto the live tree without going + * to storage. The on-disk log is only read at pool import time, to reload the + * in-memory log. + * + * Each txg, some amount of the in-memory log will be flushed out to a DDT + * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to + * keep up with the rate of change on dedup entries, but not so much that it + * would impact overall throughput, and not using too much memory. See the + * zfs_dedup_log_* tuneables in zfs(4) for more details. + * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in @@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0; uint_t dedup_class_wait_txgs = 5; +/* + * Don't do more than this many incremental flush passes per txg. + */ +uint_t zfs_dedup_log_flush_passes_max = 8; + +/* + * Minimum time to flush per txg. + */ +uint_t zfs_dedup_log_flush_min_time_ms = 1000; + +/* + * Minimum entries to flush per txg. + */ +uint_t zfs_dedup_log_flush_entries_min = 1000; + +/* + * Number of txgs to average flow rates across. + */ +uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; + static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = DDT_FLAG_FLAT, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; /* Dummy version to signal that configure is still necessary */ @@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); + ddt->ddt_object[type][class], &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) } uint64_t -ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; - if (ddt->ddt_flags & DDT_FLAG_FLAT) { - refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; - } else { - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; - } + if (ddt->ddt_flags & DDT_FLAG_FLAT) + refcnt = ddp->ddp_flat.ddp_refcnt; + else + for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) + refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } @@ -743,11 +784,15 @@ ddt_init(void) DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + ddt_log_init(); } void ddt_fini(void) { + ddt_log_fini(); + kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); @@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + /* Entry is still in the log, so charge the entry back to it */ + if (dde->dde_flags & DDE_FLAG_LOGGED) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } @@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) avl_insert(&ddt->ddt_tree, dde, where); + /* If its in the log tree, we can "load" it from there */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + ddt_lightweight_entry_t ddlwe; + + if (ddt_log_take_key(ddt, ddt->ddt_log_active, + &search, &ddlwe) || + ddt_log_take_key(ddt, ddt->ddt_log_flushing, + &search, &ddlwe)) { + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + + dde->dde_type = ddlwe.ddlwe_type; + dde->dde_class = ddlwe.ddlwe_class; + memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, + DDT_PHYS_SIZE(ddt)); + + return (dde); + } + } + /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, @@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { /* - * The histograms only track inactive (stored) blocks. + * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. When its synced back, it'll be re-added * to the right one. + * + * We only do this when we successfully found it in the store. + * error == ENOENT means this is a new entry, and so its already + * not counted. */ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) } } + ddt_log_destroy(ddt, tx); + uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, @@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt_log_alloc(ddt); + return (ddt); } static void ddt_table_free(ddt_t *ddt) { + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); @@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa) } } + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + /* * Seed the cached histograms. */ @@ -1483,145 +1567,447 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) } static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) +{ + /* + * Count all the entries stored for each type/class, and updates the + * stats within (ddt_object_sync()). If there's no entries for the + * type/class, the whole object is removed. If all objects for the DDT + * are removed, its containing dir is removed, effectively resetting + * the entire DDT to an empty slate. + */ + uint64_t count = 0; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + uint64_t add, tcount = 0; + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); + tcount += add; + } + } + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (tcount == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + count += tcount; + } + + if (ddt->ddt_flags & DDT_FLAG_LOG) { + /* Include logged entries in the total count */ + count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); + count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. + */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; +} + +static void +ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_key_t *ddk = &dde->dde_key; - ddt_type_t otype = dde->dde_type; - ddt_type_t ntype = DDT_TYPE_DEFAULT; - ddt_class_t oclass = dde->dde_class; - ddt_class_t nclass; - uint64_t total_refcnt = 0; - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Compute the target class, so we can decide whether or not to inform + * the scrub traversal (below). Note that we don't store this in the + * entry, as it might change multiple times before finally being + * committed (if we're logging). Instead, we recompute it in + * ddt_sync_entry(). + */ + uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + + /* + * If the class changes, the order that we scan this bp changes. If it + * decreases, we could miss it, so scan it right now. (This covers both + * class changing while we are doing ddt_walk(), and when we are + * traversing.) + * + * We also do this when the refcnt goes to zero, because that change is + * only in the log so far; the blocks on disk won't be freed until + * the log is flushed, and the refcnt might increase before that. If it + * does, then we could miss it in the same way. + */ + if (refcnt == 0 || nclass < ddlwe->ddlwe_class) + dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, + ddlwe, tx); +} + +static void +ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) +{ + ddt_key_t *ddk = &ddlwe->ddlwe_key; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + uint64_t refcnt = 0; + /* + * Compute the total refcnt. Along the way, issue frees for any DVAs + * we no longer want. + */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ASSERT(dde->dde_io == NULL || - dde->dde_io->dde_lead_zio[p] == NULL); - ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT0(phys_refcnt); + ASSERT3U(phys_refcnt, ==, 0); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* - * Note, we no longer create DDT-DITTO blocks, but we - * don't want to leak any written by older software. + * We don't want to keep any obsolete slots (eg ditto), + * regardless of their refcount, but we don't want to + * leak them either. So, free them. */ - ddt_phys_free(ddt, ddk, ddp, v, txg); + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, v, txg); - total_refcnt += phys_refcnt; + /* No remaining references, free it! */ + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); + refcnt += phys_refcnt; } - if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; + /* Select the best class for the entry. */ + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + /* + * If an existing entry changed type or class, or its refcount reached + * zero, delete it from the DDT object + */ if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { + (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); - ASSERT3U( - ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); + ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; + /* + * Add or update the entry + */ + if (refcnt != 0) { + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + + ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); + } +} - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); +/* Calculate an exponential weighted moving average, lower limited to zero */ +static inline int32_t +_ewma(int32_t val, int32_t prev, uint32_t weight) +{ + ASSERT3U(val, >=, 0); + ASSERT3U(prev, >=, 0); + const int32_t new = + MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); + ASSERT3U(new, >=, 0); + return (new); +} - ddt_histogram_t *ddh = - &ddt->ddt_histogram[ntype][nclass]; - ddt_histogram_add_entry(ddt, ddh, &ddlwe); +/* Returns true if done for this txg */ +static boolean_t +ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) +{ + if (ddt->ddt_flush_pass == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* First run this txg, get set up */ + ddt->ddt_flush_start = gethrtime(); + ddt->ddt_flush_count = 0; + /* + * How many entries we need to flush. We want to at + * least match the ingest rate. + */ + ddt->ddt_flush_min = MAX( + ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_entries_min); + } else { + /* We already decided we're done for this txg */ + return (B_FALSE); + } + } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) + * We already did some flushing on this pass, skip it. This + * happens when dsl_process_async_destroys() runs during a scan + * (on pass 1) and does an additional ddt_sync() to update + * freed blocks. */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, ddt, &ddlwe, tx); - } + return (B_FALSE); } + + if (spa_sync_pass(ddt->ddt_spa) > + MAX(zfs_dedup_log_flush_passes_max, 1)) { + /* Too many passes this txg, defer until next. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* Nothing to flush, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? + MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); + + uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; + + if (elapsed_time >= target_time) { + /* Too long since we started, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + ddt->ddt_flush_pass++; + ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); + + /* + * Estimate how much time we'll need to flush the remaining entries + * based on how long it normally takes. + */ + uint32_t want_time; + if (ddt->ddt_flush_pass == 1) { + /* First pass, use the average time/entries */ + if (ddt->ddt_log_flush_rate == 0) + /* Zero rate, just assume the whole time */ + want_time = target_time; + else + want_time = ddt->ddt_flush_min * + ddt->ddt_log_flush_time_rate / + ddt->ddt_log_flush_rate; + } else { + /* Later pass, calculate from this txg so far */ + want_time = ddt->ddt_flush_min * + elapsed_time / ddt->ddt_flush_count; + } + + /* Figure out how much time we have left */ + uint32_t remain_time = target_time - elapsed_time; + + /* Smear the remaining entries over the remaining passes. */ + uint32_t nentries = ddt->ddt_flush_min / + (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); + if (want_time > remain_time) { + /* + * We're behind; try to catch up a bit by doubling the amount + * this pass. If we're behind that means we're in a later + * pass and likely have most of the remaining time to + * ourselves. If we're in the last couple of passes, then + * doubling might just take us over the timeout, but probably + * not be much, and it stops us falling behind. If we're + * in the middle passes, there'll be more to do, but it + * might just help us catch up a bit and we'll recalculate on + * the next pass anyway. + */ + nentries = MIN(ddt->ddt_flush_min, nentries*2); + } + + ddt_lightweight_entry_t ddlwe; + uint32_t count = 0; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End this pass if we've synced as much as we need to. */ + if (++count >= nentries) + break; + } + ddt->ddt_flush_count += count; + ddt->ddt_flush_min -= count; + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + ddt_log_truncate(ddt, tx); + /* No more passes needed this txg */ + ddt->ddt_flush_pass = 0; + } else + /* More to do next time, save checkpoint */ + ddt_log_checkpoint(ddt, &ddlwe, tx); + + ddt_sync_update_stats(ddt, tx); + + return (ddt->ddt_flush_pass == 0); } static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; + ASSERT(avl_is_empty(&ddt->ddt_tree)); - if (avl_numnodes(&ddt->ddt_tree) == 0) + /* Don't do any flushing when the pool is ready to shut down */ + if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) return; - ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); + /* Try to flush some. */ + if (!ddt_sync_flush_log_incremental(ddt, tx)) + /* More to do next time */ + return; - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); + /* No more flushing this txg, so we can do end-of-txg housekeeping */ + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { + /* + * No more to flush, and the active list has stuff, so + * try to swap the logs for next time. + */ + (void) ddt_log_swap(ddt, tx); } - if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) - ddt_create_dir(ddt, tx); + /* + * Update flush rate. This is an exponential weighted moving average of + * the number of entries flushed over recent txgs. + */ + ddt->ddt_log_flush_rate = _ewma( + ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + zfs_dedup_log_flush_flow_rate_txgs); - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(ddt, dde); - } + /* + * Update flush time rate. This is an exponential weighted moving + * average of the total time taken to flush over recent txgs. + */ + ddt->ddt_log_flush_time_rate = _ewma( + ddt->ddt_log_flush_time_rate, + ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + zfs_dedup_log_flush_flow_rate_txgs); +} - uint64_t count = 0; - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, tcount = 0; - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY0(ddt_object_count(ddt, type, class, - &add)); - tcount += add; - } +static void +ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) +{ + uint64_t count = avl_numnodes(&ddt->ddt_tree); + + if (count > 0) { + ddt_log_update_t dlu = {0}; + ddt_log_begin(ddt, count, tx, &dlu); + + ddt_entry_t *dde; + void *cookie = NULL; + ddt_lightweight_entry_t ddlwe; + while ((dde = + avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_log_entry(ddt, &ddlwe, &dlu); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (tcount == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); + + ddt_log_commit(ddt, &dlu); + + /* + * Sync the stats for the store objects. Even though we haven't + * modified anything on those objects, they're no longer the + * source of truth for entries that are now in the log, and we + * need the on-disk counts to reflect that, otherwise we'll + * miscount later when importing. + */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; + class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) + ddt_object_sync(ddt, type, class, tx); + } } - count += tcount; + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } - if (count == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) /* - * No entries left on the DDT, so reset the version for next - * time. This allows us to handle the feature being changed - * since the DDT was originally created. New entries should get - * whatever the feature currently demands. + * Update ingest rate. This is an exponential weighted moving + * average of the number of entries changed over recent txgs. + * The ramp-up cost shouldn't matter too much because the + * flusher will be trying to take at least the minimum anyway. */ - if (ddt->ddt_version == DDT_VERSION_FDT) - ddt_destroy_dir(ddt, tx); + ddt->ddt_log_ingest_rate = _ewma( + count, ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_flow_rate_txgs); +} - ddt->ddt_version = DDT_VERSION_UNCONFIGURED; - ddt->ddt_flags = 0; +static void +ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) +{ + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ddt_entry_t *dde; + void *cookie = NULL; + while ((dde = avl_destroy_nodes( + &ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_sync_flush_entry(ddt, &ddlwe, + dde->dde_type, dde->dde_class, tx); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + ddt_sync_update_stats(ddt, tx); +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + + if (ddt->ddt_version == UINT64_MAX) + return; + + if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + return; + } + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_table_log(ddt, tx); + else + ddt_sync_table_flush(ddt, tx); } void @@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg) ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; - ddt_sync_table(ddt, tx, txg); + ddt_sync_table(ddt, tx); + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } @@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } - if (dde->dde_type < DDT_TYPES) { - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - + if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { + /* + * This entry was either synced to a store object (dde_type is + * real) or was logged. It must be properly on disk at this + * point, so we can just bump its refcount. + */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * we may have a block with the DEDUP set, but which doesn't * have a corresponding entry in the DDT. Be ready. */ - ASSERT3S(dde->dde_class, ==, DDT_CLASSES); ddt_remove(ddt, dde); result = B_FALSE; } @@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, + "Max number of incremental dedup log flush passes per transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, + "Min time to spend on incremental dedup log flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, + "Min number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, + "Number of txgs to average flow rates across"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c new file mode 100644 index 000000000..7e7ff9e5b --- /dev/null +++ b/module/zfs/ddt_log.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, Klara Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/ddt.h> +#include <sys/dmu_tx.h> +#include <sys/dmu.h> +#include <sys/ddt_impl.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +/* + * No more than this many txgs before swapping logs. + */ +uint_t zfs_dedup_log_txg_max = 8; + +/* + * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module + * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. + */ +uint64_t zfs_dedup_log_mem_max = 0; +uint_t zfs_dedup_log_mem_max_percent = 1; + + +static kmem_cache_t *ddt_log_entry_flat_cache; +static kmem_cache_t *ddt_log_entry_trad_cache; + +#define DDT_LOG_ENTRY_FLAT_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_LOG_ENTRY_TRAD_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_LOG_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) + +void +ddt_log_init(void) +{ + ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", + DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", + DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Max memory for log AVL entries. At least 1M, because we need + * something (that's ~3800 entries per tree). They can say 100% if they + * want; it just means they're at the mercy of the the txg flush limit. + */ + if (zfs_dedup_log_mem_max == 0) { + zfs_dedup_log_mem_max_percent = + MIN(zfs_dedup_log_mem_max_percent, 100); + zfs_dedup_log_mem_max = (physmem * PAGESIZE) * + zfs_dedup_log_mem_max_percent / 100; + } + zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); +} + +void +ddt_log_fini(void) +{ + kmem_cache_destroy(ddt_log_entry_trad_cache); + kmem_cache_destroy(ddt_log_entry_flat_cache); +} + +static void +ddt_log_name(ddt_t *ddt, char *name, uint_t n) +{ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, + zio_checksum_table[ddt->ddt_checksum].ci_name, n); +} + +static void +ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) +{ + dmu_buf_t *db; + VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; + DLH_SET_VERSION(hdr, 1); + DLH_SET_FLAGS(hdr, ddl->ddl_flags); + hdr->dlh_length = ddl->ddl_length; + hdr->dlh_first_txg = ddl->ddl_first_txg; + hdr->dlh_checkpoint = ddl->ddl_checkpoint; + + dmu_buf_rele(db, FTAG); +} + +static void +ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + ASSERT3U(ddl->ddl_object, ==, 0); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &ddl->ddl_object, tx)); + ddl->ddl_length = 0; + ddl->ddl_first_txg = tx->tx_txg; + ddt_log_update_header(ddt, ddl, tx); +} + +static void +ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + + if (ddl->ddl_object == 0) + return; + + ASSERT0(ddl->ddl_length); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); + VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); + + ddl->ddl_object = 0; +} + +void +ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_update_stats(ddt_t *ddt) +{ + /* + * Log object stats. We count the number of live entries in the log + * tree, even if there are more than on disk, and even if the same + * entry is on both append and flush trees, because that's more what + * the user expects to see. This does mean the on-disk size is not + * really correlated with the number of entries, but I don't think + * that's reasonable to expect anyway. + */ + dmu_object_info_t doi; + uint64_t nblocks; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); + nblocks = doi.doi_physical_blocks_512; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); + nblocks += doi.doi_physical_blocks_512; + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo->ddo_count = + avl_numnodes(&ddt->ddt_log_active->ddl_tree) + + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); + ddo->ddo_dspace = nblocks << 9; +} + +void +ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) +{ + ASSERT3U(nentries, >, 0); + ASSERT3P(dlu->dlu_dbp, ==, NULL); + + if (ddt->ddt_log_active->ddl_object == 0) + ddt_log_create(ddt, tx); + + /* + * We want to store as many entries as we can in a block, but never + * split an entry across block boundaries. + */ + size_t reclen = P2ALIGN_TYPED( + sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); + ASSERT3U(reclen, <=, UINT16_MAX); + dlu->dlu_reclen = reclen; + + VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, + &dlu->dlu_dn)); + dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); + + uint64_t nblocks = howmany(nentries, + dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); + uint64_t offset = ddt->ddt_log_active->ddl_length; + uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; + + VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, + B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + DMU_READ_NO_PREFETCH)); + + dlu->dlu_tx = tx; + dlu->dlu_block = dlu->dlu_offset = 0; +} + +static ddt_log_entry_t * +ddt_log_alloc_entry(ddt_t *ddt) +{ + ddt_log_entry_t *ddle; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); + } else { + ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); + } + + return (ddle); +} + +static void +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + /* Create the log tree entry from a live or stored entry */ + avl_index_t where; + ddt_log_entry_t *ddle = + avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); + if (ddle == NULL) { + ddle = ddt_log_alloc_entry(ddt); + ddle->ddle_key = ddlwe->ddlwe_key; + avl_insert(&ddl->ddl_tree, ddle, where); + } + ddle->ddle_type = ddlwe->ddlwe_type; + ddle->ddle_class = ddlwe->ddlwe_class; + memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); +} + +void +ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + /* Get our block */ + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; + + /* + * If this would take us past the end of the block, finish it and + * move to the next one. + */ + if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { + ASSERT3U(dlu->dlu_offset, >, 0); + dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); + dlu->dlu_block++; + dlu->dlu_offset = 0; + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + db = dlu->dlu_dbp[dlu->dlu_block]; + } + + /* + * If this is the first time touching the block, inform the DMU that + * we will fill it, and zero it out. + */ + if (dlu->dlu_offset == 0) { + dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + memset(db->db_data, 0, db->db_size); + } + + /* Create the log record directly in the buffer */ + ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); + DLR_SET_TYPE(dlr, DLR_ENTRY); + DLR_SET_RECLEN(dlr, dlu->dlu_reclen); + DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); + DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)&dlr->dlr_payload; + dlre->dlre_key = ddlwe->ddlwe_key; + memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + + /* Advance offset for next record. */ + dlu->dlu_offset += dlu->dlu_reclen; +} + +void +ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); + ASSERT3U(dlu->dlu_offset, >, 0); + + /* + * Close out the last block. Whatever we haven't used will be zeroed, + * which matches DLR_INVALID, so we can detect this during load. + */ + dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); + + dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + + ddt->ddt_log_active->ddl_length += + dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; + dnode_rele(dlu->dlu_dn, FTAG); + + ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); + + memset(dlu, 0, sizeof (ddt_log_update_t)); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +void +ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + ASSERT3U(ddl->ddl_object, !=, 0); + +#ifdef ZFS_DEBUG + /* + * There should not be any entries on the log tree before the given + * checkpoint. Assert that this is the case. + */ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle != NULL) + VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), + >, 0); +#endif + + ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; + ddl->ddl_checkpoint = ddlwe->ddlwe_key; + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +void +ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + if (ddl->ddl_object == 0) + return; + + ASSERT(avl_is_empty(&ddl->ddl_tree)); + + /* Eject the entire object */ + dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); + + ddl->ddl_length = 0; + ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; + memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) +{ + /* Swap the logs. The old flushing one must be empty */ + VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); + + /* + * If there are still blocks on the flushing log, truncate it first. + * This can happen if there were entries on the flushing log that were + * removed in memory via ddt_lookup(); their vestigal remains are + * on disk. + */ + if (ddt->ddt_log_flushing->ddl_length > 0) + ddt_log_truncate(ddt, tx); + + /* + * Swap policy. We swap the logs (and so begin flushing) when the + * active tree grows too large, or when we haven't swapped it in + * some amount of time. + */ + + /* + * The log tree is too large if the memory usage of its entries is over + * half of the memory limit. This effectively gives each log tree half + * the available memory. + */ + const boolean_t too_large = + (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * + DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); + + const boolean_t too_old = + tx->tx_txg >= + (ddt->ddt_log_active->ddl_first_txg + + MAX(1, zfs_dedup_log_txg_max)); + + if (!(too_large || too_old)) + return (B_FALSE); + + ddt_log_t *swap = ddt->ddt_log_active; + ddt->ddt_log_active = ddt->ddt_log_flushing; + ddt->ddt_log_flushing = swap; + + ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); + ddt->ddt_log_active->ddl_flags &= + ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); + + ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; + + ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; + + ddt_log_update_header(ddt, ddt->ddt_log_active, tx); + ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); + + ddt_log_update_stats(ddt); + + return (B_TRUE); +} + +static inline void +ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, + const ddt_key_t *checkpoint) +{ + ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)dlr->dlr_payload; + if (checkpoint != NULL && + ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { + /* Skip pre-checkpoint entries; they're already flushed. */ + return; + } + + ddt_lightweight_entry_t ddlwe; + ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); + ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); + + ddlwe.ddlwe_key = dlre->dlre_key; + memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); + + ddt_log_update_entry(ddt, ddl, &ddlwe); +} + +static void +ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) +{ + void *cookie = NULL; + ddt_log_entry_t *ddle; + IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); + while ((ddle = + avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + } + ASSERT(avl_is_empty(&ddl->ddl_tree)); +} + +static int +ddt_log_load_one(ddt_t *ddt, uint_t n) +{ + ASSERT3U(n, <, 2); + + ddt_log_t *ddl = &ddt->ddt_log[n]; + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + uint64_t obj; + int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &obj); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + dnode_t *dn; + err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); + if (err != 0) + return (err); + + ddt_log_header_t hdr; + dmu_buf_t *db; + err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + return (err); + } + memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); + dmu_buf_rele(db, FTAG); + + if (DLH_GET_VERSION(&hdr) != 1) { + dnode_rele(dn, FTAG); + zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " + "unknown version=%llu", spa_name(ddt->ddt_spa), name, + (u_longlong_t)DLH_GET_VERSION(&hdr)); + return (SET_ERROR(EINVAL)); + } + + ddt_key_t *checkpoint = NULL; + if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { + /* + * If the log has a checkpoint, then we can ignore any entries + * that have already been flushed. + */ + ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); + checkpoint = &hdr.dlh_checkpoint; + } + + if (hdr.dlh_length > 0) { + dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, + ZIO_PRIORITY_SYNC_READ); + + for (uint64_t offset = 0; offset < hdr.dlh_length; + offset += dn->dn_datablksz) { + err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, + DMU_READ_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (err); + } + + uint64_t boffset = 0; + while (boffset < db->db_size) { + ddt_log_record_t *dlr = + (ddt_log_record_t *)(db->db_data + boffset); + + /* Partially-filled block, skip the rest */ + if (DLR_GET_TYPE(dlr) == DLR_INVALID) + break; + + switch (DLR_GET_TYPE(dlr)) { + case DLR_ENTRY: + ddt_log_load_entry(ddt, ddl, dlr, + checkpoint); + break; + + default: + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (SET_ERROR(EINVAL)); + } + + boffset += DLR_GET_RECLEN(dlr); + } + + dmu_buf_rele(db, FTAG); + } + } + + dnode_rele(dn, FTAG); + + ddl->ddl_object = obj; + ddl->ddl_flags = DLH_GET_FLAGS(&hdr); + ddl->ddl_length = hdr.dlh_length; + ddl->ddl_first_txg = hdr.dlh_first_txg; + + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + ddt->ddt_log_flushing = ddl; + else + ddt->ddt_log_active = ddl; + + return (0); +} + +int +ddt_log_load(ddt_t *ddt) +{ + int err; + + if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { + /* + * The DDT is going to be freed again in a moment, so there's + * no point loading the log; it'll just slow down import. + */ + return (0); + } + + ASSERT0(ddt->ddt_log[0].ddl_object); + ASSERT0(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then the log object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, UINT64_MAX); + return (SET_ERROR(ENOENT)); + } + + if ((err = ddt_log_load_one(ddt, 0)) != 0) + return (err); + if ((err = ddt_log_load_one(ddt, 1)) != 0) + return (err); + + VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING)); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT)); + VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING); + + /* + * We have two finalisation tasks: + * + * - rebuild the histogram. We do this at the end rather than while + * we're loading so we don't need to uncount and recount entries that + * appear multiple times in the log. + * + * - remove entries from the flushing tree that are on both trees. This + * happens when ddt_lookup() rehydrates an entry from the flushing + * tree, as ddt_log_take_key() removes the entry from the in-memory + * tree but doesn't remove it from disk. + */ + + /* + * We don't technically need a config lock here, since there shouldn't + * be pool config changes during DDT load. dva_get_dsize_sync() via + * ddt_stat_generate() is expecting it though, and it won't hurt + * anything, so we take it. + */ + spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER); + + avl_tree_t *al = &ddt->ddt_log_active->ddl_tree; + avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree; + ddt_log_entry_t *ae = avl_first(al); + ddt_log_entry_t *fe = avl_first(fl); + while (ae != NULL || fe != NULL) { + ddt_log_entry_t *ddle; + if (ae == NULL) { + /* active exhausted, take flushing */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else if (fe == NULL) { + /* flushing exuhausted, take active */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else { + /* compare active and flushing */ + int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key); + if (c < 0) { + /* active behind, take and advance */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else if (c > 0) { + /* flushing behind, take and advance */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else { + /* match. remove from flushing, take active */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + avl_remove(fl, ddle); + + ddle = ae; + ae = AVL_NEXT(al, ae); + } + } + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + + spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); + + ddt_log_update_stats(ddt); + + return (0); +} + +void +ddt_log_alloc(ddt_t *ddt) +{ + ASSERT3P(ddt->ddt_log_active, ==, NULL); + ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + + avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + ddt->ddt_log_active = &ddt->ddt_log[0]; + ddt->ddt_log_flushing = &ddt->ddt_log[1]; + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; +} + +void +ddt_log_free(ddt_t *ddt) +{ + ddt_log_empty(ddt, &ddt->ddt_log[0]); + ddt_log_empty(ddt, &ddt->ddt_log[1]); + avl_destroy(&ddt->ddt_log[0].ddl_tree); + avl_destroy(&ddt->ddt_log[1].ddl_tree); +} + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, + "Max transactions before starting to flush dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, + "Max memory for dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, + "Max memory for dedup logs, as % of total memory"); diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 9316200f2..8f55bc24f 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddo_total->ddo_mspace += ddo->ddo_mspace; } } + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } /* @@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) &ddt->ddt_histogram_cache[type][class]); } } + + ddt_histogram_add(ddh, &ddt->ddt_log_histogram); } } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 3de316a12..96943421f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special +DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 83c4d7c8e..4f6e5805b 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -29,9 +29,16 @@ log_assert "basic dedup (FDT) operations work" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index f0f20671b..259eaddc0 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -29,9 +29,16 @@ log_assert "dedup (FDT) retains version after import" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index 049ccaae3..114cf0266 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -30,9 +30,16 @@ log_assert "legacy and FDT dedup tables on the same pool can happily coexist" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index d563fade8..c36463134 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -30,9 +30,16 @@ log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 5b83a1ca3..326152b51 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -51,6 +51,12 @@ POOL="dedup_pool" save_tunable TXG_TIMEOUT +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { if poolexists $POOL ; then @@ -58,6 +64,7 @@ function cleanup fi log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable TXG_TIMEOUT + log_must restore_tunable DEDUP_LOG_TXG_MAX } @@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit # # With no DDT quota in place, the above workload will produce over - # 800,000 entries by using space in the normal class. With a quota, - # it will be well below 500,000 entries. + # 800,000 entries by using space in the normal class. With a quota, it + # should be well under 500,000. However, logged entries are hard to + # account for because they can appear on both logs, and can also + # represent an eventual removal. This isn't easily visible from + # outside, and even internally can result in going slightly over quota. + # For here, we just set the entry count a little higher than what we + # expect to allow for some instability. # - log_must test $(ddt_entries) -le 500000 + log_must test $(ddt_entries) -le 600000 do_clean } |