-rw-r--r--  cmd/zdb/zdb.c                                                        |  33
-rw-r--r--  include/sys/ddt.h                                                    |  39
-rw-r--r--  include/sys/ddt_impl.h                                               | 131
-rw-r--r--  include/sys/dmu.h                                                    |   1
-rw-r--r--  lib/libzpool/Makefile.am                                             |   1
-rw-r--r--  man/man4/zfs.4                                                       |  82
-rw-r--r--  module/Kbuild.in                                                     |   1
-rw-r--r--  module/Makefile.bsd                                                  |   2
-rw-r--r--  module/zfs/ddt.c                                                     | 604
-rw-r--r--  module/zfs/ddt_log.c                                                 | 760
-rw-r--r--  module/zfs/ddt_stats.c                                               |   9
-rw-r--r--  tests/zfs-tests/include/tunables.cfg                                 |   1
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh          |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh          |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh    |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh  |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh               |  18
17 files changed, 1600 insertions, 110 deletions
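
For orientation before the per-file diffs: the dedup log added by this change is governed by the new zfs_dedup_log_* tunables documented in man/man4/zfs.4 and declared via ZFS_MODULE_PARAM in module/zfs/ddt.c and module/zfs/ddt_log.c below. A minimal sketch of inspecting and adjusting them at runtime, assuming the usual Linux module-parameter layout under /sys/module/zfs/parameters (the values shown are illustrative, not recommendations):

    # Show the current dedup-log tunables
    grep . /sys/module/zfs/parameters/zfs_dedup_log_*

    # Flush at least 2000 entries per txg instead of the default 1000, and let
    # the active log accumulate for up to 16 txgs (default 8) before swapping.
    # zfs_dedup_log_mem_max and zfs_dedup_log_mem_max_percent are declared
    # ZMOD_RD below, so they can only be set at module load time.
    echo 2000 > /sys/module/zfs/parameters/zfs_dedup_log_flush_entries_min
    echo 16 > /sys/module/zfs/parameters/zfs_dedup_log_txg_max
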
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 250052adf..c72df3909 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1959,6 +1959,32 @@ dump_dedup_ratio(const ddt_stat_t *dds) } static void +dump_ddt_log(ddt_t *ddt) +{ + for (int n = 0; n < 2; n++) { + ddt_log_t *ddl = &ddt->ddt_log[n]; + + uint64_t count = avl_numnodes(&ddl->ddl_tree); + if (count == 0) + continue; + + printf(DMU_POOL_DDT_LOG ": %lu log entries\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); + + if (dump_opt['D'] < 4) + continue; + + ddt_lightweight_entry_t ddlwe; + uint64_t index = 0; + for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + dump_ddt_entry(ddt, &ddlwe, index++); + } + } +} + +static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; @@ -2027,6 +2053,7 @@ dump_all_ddts(spa_t *spa) dump_ddt(ddt, type, class); } } + dump_ddt_log(ddt); } ddt_get_dedup_stats(spa, &dds_total); @@ -5743,7 +5770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); + VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); ddt_phys_decref(dde->dde_phys, v); /* @@ -8120,6 +8147,10 @@ dump_mos_leaks(spa_t *spa) /* FDT container */ mos_obj_refd(ddt->ddt_dir_object); + + /* FDT log objects */ + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); } if (spa->spa_brt != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 2dd18526d..2fc798725 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -43,7 +43,8 @@ struct abd; * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ -#define DDT_FLAG_MASK (DDT_FLAG_FLAT) +#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -209,6 +210,7 @@ typedef enum { /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ /* * Additional data to support entry update or repair. This is fixed size @@ -255,6 +257,19 @@ typedef struct { } ddt_lightweight_entry_t; /* + * In-core DDT log. A separate struct to make it easier to switch between the + * appending and flushing logs. + */ +typedef struct { + avl_tree_t ddl_tree; /* logged entries */ + uint32_t ddl_flags; /* flags for this log */ + uint64_t ddl_object; /* log object id */ + uint64_t ddl_length; /* on-disk log size */ + uint64_t ddl_first_txg; /* txg log became active */ + ddt_key_t ddl_checkpoint; /* last checkpoint */ +} ddt_log_t; + +/* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. 
*/ @@ -262,8 +277,22 @@ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + avl_tree_t ddt_log_tree; /* logged entries */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + ddt_log_t ddt_log[2]; /* active/flushing logs */ + ddt_log_t *ddt_log_active; /* pointers into ddt_log */ + ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ + + hrtime_t ddt_flush_start; /* log flush start this txg */ + uint32_t ddt_flush_pass; /* log flush pass this txg */ + + int32_t ddt_flush_count; /* entries flushed this txg */ + int32_t ddt_flush_min; /* min rem entries to flush */ + int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ + int32_t ddt_log_flush_rate; /* rolling log flush rate */ + int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ @@ -276,13 +305,17 @@ typedef struct { /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - /* object ids for whole-ddt and per-type/per-class stats */ + /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; + ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + + /* log stats power-2-sized referenced blocks */ + ddt_histogram_t ddt_log_histogram; } ddt_t; /* diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index ce4bc559d..6f11cd90c 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -28,6 +28,7 @@ #define _SYS_DDT_IMPL_H #include <sys/ddt.h> +#include <sys/bitops.h> #ifdef __cplusplus extern "C" { @@ -50,6 +51,106 @@ extern "C" { memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) +#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ + (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ + (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +/* + * An entry on the log tree. These are "frozen", and a record of what's in + * the on-disk log. They can't be used in place, but can be "loaded" back into + * the live tree. + */ +typedef struct { + ddt_key_t ddle_key; /* ddt_log_tree key */ + avl_node_t ddle_node; /* ddt_log_tree node */ + + ddt_type_t ddle_type; /* storage type */ + ddt_class_t ddle_class; /* storage class */ + + /* extra allocation for flat/trad phys */ + ddt_univ_phys_t ddle_phys[]; +} ddt_log_entry_t; + +/* On-disk log record types. */ +typedef enum { + DLR_INVALID = 0, /* end of block marker */ + DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ +} ddt_log_record_type_t; + +/* On-disk log record header. */ +typedef struct { + /* + * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to + * access it. 
+ * + * bits 0-7: record type (ddt_log_record_type_t) + * bits 8-15: length of record header+payload + * bits 16-47: reserved, all zero + * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) + * otherwise all zero + * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) + * otherwise all zero + */ + uint64_t dlr_info; + uint8_t dlr_payload[]; +} ddt_log_record_t; + +#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) +#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) +#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) +#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) +#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) +#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) +#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) +#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) + +/* Payload for DLR_ENTRY. */ +typedef struct { + ddt_key_t dlre_key; + ddt_univ_phys_t dlre_phys[]; +} ddt_log_record_entry_t; + +/* Log flags (ddl_flags, dlh_flags) */ +#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ +#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ + +/* On-disk log header, stored in the bonus buffer. */ +typedef struct { + /* + * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to + * access it. + * + * bits 0-7: log version + * bits 8-15: log flags + * bits 16-63: reserved, all zero + */ + uint64_t dlh_info; + + uint64_t dlh_length; /* log size in bytes */ + uint64_t dlh_first_txg; /* txg this log went active */ + ddt_key_t dlh_checkpoint; /* last checkpoint */ +} ddt_log_header_t; + +#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) +#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) +#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) +#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) + +/* DDT log update state */ +typedef struct { + dmu_tx_t *dlu_tx; /* tx the update is being applied to */ + dnode_t *dlu_dn; /* log object dnode */ + dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ + int dlu_ndbp; /* number of block buffer pointers */ + uint16_t dlu_reclen; /* cached length of record */ + uint64_t dlu_block; /* block for next entry */ + uint64_t dlu_offset; /* offset for next entry */ +} ddt_log_update_t; + /* * Ops vector to access a specific DDT object type. */ @@ -77,6 +178,33 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; +/* Dedup log API */ +extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, + ddt_log_update_t *dlu); +extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, + ddt_log_update_t *dlu); +extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); + +extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + dmu_tx_t *tx); +extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); + +extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); + +extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); + +extern int ddt_log_load(ddt_t *ddt); +extern void ddt_log_alloc(ddt_t *ddt); +extern void ddt_log_free(ddt_t *ddt); + +extern void ddt_log_init(void); +extern void ddt_log_fini(void); + /* * These are only exposed so that zdb can access them. 
Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops; */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, + const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5b80dc315..928f5f2b4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -375,6 +375,7 @@ typedef struct dmu_buf { #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_LOG "DDT-log-%s-%u" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db..070dc0132 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_log.c \ module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 45b6c338a..aae3d7dfb 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -974,6 +974,88 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . +.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of dedup log flush passes (iterations) each transaction. +.Pp +At the start of each transaction, OpenZFS will estimate how many entries it +needs to flush out to keep up with the change rate, taking the amount and time +taken to flush on previous txgs into account (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +It will spread this amount into a number of passes. +At each pass, it will use the amount already flushed and the total time taken +by flushing and by other IO to recompute how much it should do for the remainder +of the txg. +.Pp +Reducing the max number of passes will make flushing more aggressive, flushing +out more entries on each pass. +This can be faster, but also more likely to compete with other IO. +Increasing the max number of passes will put fewer entries onto each pass, +keeping the overhead of dedup changes to a minimum but possibly causing a large +number of changes to be dumped on the last pass, which can blow out the txg +sync time beyond +.Sy zfs_txg_timeout . +. +.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint +Minimum time to spend on dedup log flush each transaction. +.Pp +At least this long will be spent flushing dedup log entries each transaction, +up to +.Sy zfs_txg_timeout . +This occurs even if doing so would delay the transaction, that is, other IO +completes under this time. +. +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +Flush at least this many entries each transaction. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction to +keep up with the ingest rate (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +This sets the minimum for that estimate. +Raising it can force OpenZFS to flush more aggressively, keeping the log small +and so reducing pool import times, but can make it less able to back off if +log flushing would compete with other IO too much. +. 
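
One way to observe the effect of these flush tunables is the zdb support added in cmd/zdb/zdb.c earlier in this diff, which prints a per-checksum summary of the in-memory log and, at higher verbosity, each log entry via dump_ddt_log(). A sketch, assuming a pool named "tank" (the pool name is hypothetical):

    # Repeat -D for more detail; four or more also dumps individual log
    # entries (the code skips them when dump_opt['D'] < 4).
    zdb -DDDD tank
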
+.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint +Number of transactions to use to compute the flow rate. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction by +monitoring the number of entries changed (ingest rate), number of entries +flushed (flush rate) and time spent flushing (flush time rate) and combining +these into an overall "flow rate". +It will use an exponential weighted moving average over some number of recent +transactions to compute these rates. +This sets the number of transactions to compute these averages over. +Setting it higher can help to smooth out the flow rate in the face of spiky +workloads, but will take longer for the flow rate to adjust to a sustained +change in the ingress rate. +. +.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint +Max transactions to before starting to flush dedup logs. +.Pp +OpenZFS maintains two dedup logs, one receiving new changes, one flushing. +If there is nothing to flush, it will accumulate changes for no more than this +many transactions before switching the logs and starting to flush entries out. +. +.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 +Max memory to use for dedup logs. +.Pp +OpenZFS will spend no more than this much memory on maintaining the in-memory +dedup log. +Flushing will begin when around half this amount is being spent on logs. +The default value of +.Sy 0 +will cause it to be set by +.Sy zfs_dedup_log_mem_max_percent +instead. +. +.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint +Max memory to use for dedup logs, as a percentage of total memory. +.Pp +If +.Sy zfs_dedup_log_mem_max +is not set, it will be initialised as a percentage of the total memory in the +system. +. .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of diff --git a/module/Kbuild.in b/module/Kbuild.in index 57682214d..a119198db 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -322,6 +322,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index d9d31564d..534f32571 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -252,6 +252,7 @@ SRCS+= abd.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 26e127d61..ce5c4efb5 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,28 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup log + * + * Historically, all entries modified on a txg were written back to dedup + * storage objects at the end of every txg. This could cause significant + * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, + * and so required reading the whole node, updating the entry, and writing it + * back. On busy pools, this could add serious IO and memory overheads. + * + * To address this, the dedup log was added. 
If the "fast_dedup" feature is + * enabled, at the end of each txg, modified entries will be copied to an + * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the + * same block is requested again, the in-memory object will be checked first, + * and if its there, the entry inflated back onto the live tree without going + * to storage. The on-disk log is only read at pool import time, to reload the + * in-memory log. + * + * Each txg, some amount of the in-memory log will be flushed out to a DDT + * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to + * keep up with the rate of change on dedup entries, but not so much that it + * would impact overall throughput, and not using too much memory. See the + * zfs_dedup_log_* tuneables in zfs(4) for more details. + * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in @@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0; uint_t dedup_class_wait_txgs = 5; +/* + * Don't do more than this many incremental flush passes per txg. + */ +uint_t zfs_dedup_log_flush_passes_max = 8; + +/* + * Minimum time to flush per txg. + */ +uint_t zfs_dedup_log_flush_min_time_ms = 1000; + +/* + * Minimum entries to flush per txg. + */ +uint_t zfs_dedup_log_flush_entries_min = 1000; + +/* + * Number of txgs to average flow rates across. + */ +uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; + static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = DDT_FLAG_FLAT, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; /* Dummy version to signal that configure is still necessary */ @@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); + ddt->ddt_object[type][class], &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) } uint64_t -ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; - if (ddt->ddt_flags & DDT_FLAG_FLAT) { - refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; - } else { - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; - } + if (ddt->ddt_flags & DDT_FLAG_FLAT) + refcnt = ddp->ddp_flat.ddp_refcnt; + else + for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) + refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } @@ -743,11 +784,15 @@ ddt_init(void) DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + ddt_log_init(); } void ddt_fini(void) { + ddt_log_fini(); + kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); @@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + /* Entry is still in the log, 
so charge the entry back to it */ + if (dde->dde_flags & DDE_FLAG_LOGGED) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } @@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) avl_insert(&ddt->ddt_tree, dde, where); + /* If its in the log tree, we can "load" it from there */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + ddt_lightweight_entry_t ddlwe; + + if (ddt_log_take_key(ddt, ddt->ddt_log_active, + &search, &ddlwe) || + ddt_log_take_key(ddt, ddt->ddt_log_flushing, + &search, &ddlwe)) { + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + + dde->dde_type = ddlwe.ddlwe_type; + dde->dde_class = ddlwe.ddlwe_class; + memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, + DDT_PHYS_SIZE(ddt)); + + return (dde); + } + } + /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, @@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { /* - * The histograms only track inactive (stored) blocks. + * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. When its synced back, it'll be re-added * to the right one. + * + * We only do this when we successfully found it in the store. + * error == ENOENT means this is a new entry, and so its already + * not counted. */ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) } } + ddt_log_destroy(ddt, tx); + uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, @@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt_log_alloc(ddt); + return (ddt); } static void ddt_table_free(ddt_t *ddt) { + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); @@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa) } } + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + /* * Seed the cached histograms. */ @@ -1483,145 +1567,447 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) } static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) +{ + /* + * Count all the entries stored for each type/class, and updates the + * stats within (ddt_object_sync()). If there's no entries for the + * type/class, the whole object is removed. If all objects for the DDT + * are removed, its containing dir is removed, effectively resetting + * the entire DDT to an empty slate. 
+ */ + uint64_t count = 0; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + uint64_t add, tcount = 0; + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); + tcount += add; + } + } + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (tcount == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + count += tcount; + } + + if (ddt->ddt_flags & DDT_FLAG_LOG) { + /* Include logged entries in the total count */ + count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); + count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. + */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; +} + +static void +ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_key_t *ddk = &dde->dde_key; - ddt_type_t otype = dde->dde_type; - ddt_type_t ntype = DDT_TYPE_DEFAULT; - ddt_class_t oclass = dde->dde_class; - ddt_class_t nclass; - uint64_t total_refcnt = 0; - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Compute the target class, so we can decide whether or not to inform + * the scrub traversal (below). Note that we don't store this in the + * entry, as it might change multiple times before finally being + * committed (if we're logging). Instead, we recompute it in + * ddt_sync_entry(). + */ + uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + + /* + * If the class changes, the order that we scan this bp changes. If it + * decreases, we could miss it, so scan it right now. (This covers both + * class changing while we are doing ddt_walk(), and when we are + * traversing.) + * + * We also do this when the refcnt goes to zero, because that change is + * only in the log so far; the blocks on disk won't be freed until + * the log is flushed, and the refcnt might increase before that. If it + * does, then we could miss it in the same way. + */ + if (refcnt == 0 || nclass < ddlwe->ddlwe_class) + dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, + ddlwe, tx); +} + +static void +ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) +{ + ddt_key_t *ddk = &ddlwe->ddlwe_key; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + uint64_t refcnt = 0; + /* + * Compute the total refcnt. Along the way, issue frees for any DVAs + * we no longer want. 
+ */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ASSERT(dde->dde_io == NULL || - dde->dde_io->dde_lead_zio[p] == NULL); - ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT0(phys_refcnt); + ASSERT3U(phys_refcnt, ==, 0); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* - * Note, we no longer create DDT-DITTO blocks, but we - * don't want to leak any written by older software. + * We don't want to keep any obsolete slots (eg ditto), + * regardless of their refcount, but we don't want to + * leak them either. So, free them. */ - ddt_phys_free(ddt, ddk, ddp, v, txg); + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, v, txg); - total_refcnt += phys_refcnt; + /* No remaining references, free it! */ + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); + refcnt += phys_refcnt; } - if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; + /* Select the best class for the entry. */ + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + /* + * If an existing entry changed type or class, or its refcount reached + * zero, delete it from the DDT object + */ if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { + (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); - ASSERT3U( - ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); + ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; + /* + * Add or update the entry + */ + if (refcnt != 0) { + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + + ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); + } +} - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); +/* Calculate an exponential weighted moving average, lower limited to zero */ +static inline int32_t +_ewma(int32_t val, int32_t prev, uint32_t weight) +{ + ASSERT3U(val, >=, 0); + ASSERT3U(prev, >=, 0); + const int32_t new = + MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); + ASSERT3U(new, >=, 0); + return (new); +} - ddt_histogram_t *ddh = - &ddt->ddt_histogram[ntype][nclass]; - ddt_histogram_add_entry(ddt, ddh, &ddlwe); +/* Returns true if done for this txg */ +static boolean_t +ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) +{ + if (ddt->ddt_flush_pass == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* First run this txg, get set up */ + ddt->ddt_flush_start = gethrtime(); + ddt->ddt_flush_count = 0; + /* + * How many entries we need to flush. We want to at + * least match the ingest rate. + */ + ddt->ddt_flush_min = MAX( + ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_entries_min); + } else { + /* We already decided we're done for this txg */ + return (B_FALSE); + } + } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) 
+ * We already did some flushing on this pass, skip it. This + * happens when dsl_process_async_destroys() runs during a scan + * (on pass 1) and does an additional ddt_sync() to update + * freed blocks. */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, ddt, &ddlwe, tx); - } + return (B_FALSE); } + + if (spa_sync_pass(ddt->ddt_spa) > + MAX(zfs_dedup_log_flush_passes_max, 1)) { + /* Too many passes this txg, defer until next. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* Nothing to flush, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? + MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); + + uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; + + if (elapsed_time >= target_time) { + /* Too long since we started, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + ddt->ddt_flush_pass++; + ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); + + /* + * Estimate how much time we'll need to flush the remaining entries + * based on how long it normally takes. + */ + uint32_t want_time; + if (ddt->ddt_flush_pass == 1) { + /* First pass, use the average time/entries */ + if (ddt->ddt_log_flush_rate == 0) + /* Zero rate, just assume the whole time */ + want_time = target_time; + else + want_time = ddt->ddt_flush_min * + ddt->ddt_log_flush_time_rate / + ddt->ddt_log_flush_rate; + } else { + /* Later pass, calculate from this txg so far */ + want_time = ddt->ddt_flush_min * + elapsed_time / ddt->ddt_flush_count; + } + + /* Figure out how much time we have left */ + uint32_t remain_time = target_time - elapsed_time; + + /* Smear the remaining entries over the remaining passes. */ + uint32_t nentries = ddt->ddt_flush_min / + (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); + if (want_time > remain_time) { + /* + * We're behind; try to catch up a bit by doubling the amount + * this pass. If we're behind that means we're in a later + * pass and likely have most of the remaining time to + * ourselves. If we're in the last couple of passes, then + * doubling might just take us over the timeout, but probably + * not be much, and it stops us falling behind. If we're + * in the middle passes, there'll be more to do, but it + * might just help us catch up a bit and we'll recalculate on + * the next pass anyway. + */ + nentries = MIN(ddt->ddt_flush_min, nentries*2); + } + + ddt_lightweight_entry_t ddlwe; + uint32_t count = 0; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End this pass if we've synced as much as we need to. 
*/ + if (++count >= nentries) + break; + } + ddt->ddt_flush_count += count; + ddt->ddt_flush_min -= count; + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + ddt_log_truncate(ddt, tx); + /* No more passes needed this txg */ + ddt->ddt_flush_pass = 0; + } else + /* More to do next time, save checkpoint */ + ddt_log_checkpoint(ddt, &ddlwe, tx); + + ddt_sync_update_stats(ddt, tx); + + return (ddt->ddt_flush_pass == 0); } static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; + ASSERT(avl_is_empty(&ddt->ddt_tree)); - if (avl_numnodes(&ddt->ddt_tree) == 0) + /* Don't do any flushing when the pool is ready to shut down */ + if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) return; - ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); + /* Try to flush some. */ + if (!ddt_sync_flush_log_incremental(ddt, tx)) + /* More to do next time */ + return; - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); + /* No more flushing this txg, so we can do end-of-txg housekeeping */ + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { + /* + * No more to flush, and the active list has stuff, so + * try to swap the logs for next time. + */ + (void) ddt_log_swap(ddt, tx); } - if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) - ddt_create_dir(ddt, tx); + /* + * Update flush rate. This is an exponential weighted moving average of + * the number of entries flushed over recent txgs. + */ + ddt->ddt_log_flush_rate = _ewma( + ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + zfs_dedup_log_flush_flow_rate_txgs); - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(ddt, dde); - } + /* + * Update flush time rate. This is an exponential weighted moving + * average of the total time taken to flush over recent txgs. + */ + ddt->ddt_log_flush_time_rate = _ewma( + ddt->ddt_log_flush_time_rate, + ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + zfs_dedup_log_flush_flow_rate_txgs); +} - uint64_t count = 0; - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, tcount = 0; - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY0(ddt_object_count(ddt, type, class, - &add)); - tcount += add; - } +static void +ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) +{ + uint64_t count = avl_numnodes(&ddt->ddt_tree); + + if (count > 0) { + ddt_log_update_t dlu = {0}; + ddt_log_begin(ddt, count, tx, &dlu); + + ddt_entry_t *dde; + void *cookie = NULL; + ddt_lightweight_entry_t ddlwe; + while ((dde = + avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_log_entry(ddt, &ddlwe, &dlu); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (tcount == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); + + ddt_log_commit(ddt, &dlu); + + /* + * Sync the stats for the store objects. 
Even though we haven't + * modified anything on those objects, they're no longer the + * source of truth for entries that are now in the log, and we + * need the on-disk counts to reflect that, otherwise we'll + * miscount later when importing. + */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; + class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) + ddt_object_sync(ddt, type, class, tx); + } } - count += tcount; + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } - if (count == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) /* - * No entries left on the DDT, so reset the version for next - * time. This allows us to handle the feature being changed - * since the DDT was originally created. New entries should get - * whatever the feature currently demands. + * Update ingest rate. This is an exponential weighted moving + * average of the number of entries changed over recent txgs. + * The ramp-up cost shouldn't matter too much because the + * flusher will be trying to take at least the minimum anyway. */ - if (ddt->ddt_version == DDT_VERSION_FDT) - ddt_destroy_dir(ddt, tx); + ddt->ddt_log_ingest_rate = _ewma( + count, ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_flow_rate_txgs); +} - ddt->ddt_version = DDT_VERSION_UNCONFIGURED; - ddt->ddt_flags = 0; +static void +ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) +{ + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ddt_entry_t *dde; + void *cookie = NULL; + while ((dde = avl_destroy_nodes( + &ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_sync_flush_entry(ddt, &ddlwe, + dde->dde_type, dde->dde_class, tx); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + ddt_sync_update_stats(ddt, tx); +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + + if (ddt->ddt_version == UINT64_MAX) + return; + + if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + return; + } + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_table_log(ddt, tx); + else + ddt_sync_table_flush(ddt, tx); } void @@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg) ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; - ddt_sync_table(ddt, tx, txg); + ddt_sync_table(ddt, tx); + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } @@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } - if (dde->dde_type < DDT_TYPES) { - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - + if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { + /* + * This entry was either synced to a store object (dde_type is + * real) or was logged. 
It must be properly on disk at this + * point, so we can just bump its refcount. + */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * we may have a block with the DEDUP set, but which doesn't * have a corresponding entry in the DDT. Be ready. */ - ASSERT3S(dde->dde_class, ==, DDT_CLASSES); ddt_remove(ddt, dde); result = B_FALSE; } @@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, + "Max number of incremental dedup log flush passes per transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, + "Min time to spend on incremental dedup log flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, + "Min number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, + "Number of txgs to average flow rates across"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c new file mode 100644 index 000000000..7e7ff9e5b --- /dev/null +++ b/module/zfs/ddt_log.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, Klara Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/ddt.h> +#include <sys/dmu_tx.h> +#include <sys/dmu.h> +#include <sys/ddt_impl.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +/* + * No more than this many txgs before swapping logs. + */ +uint_t zfs_dedup_log_txg_max = 8; + +/* + * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module + * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. 
+ */ +uint64_t zfs_dedup_log_mem_max = 0; +uint_t zfs_dedup_log_mem_max_percent = 1; + + +static kmem_cache_t *ddt_log_entry_flat_cache; +static kmem_cache_t *ddt_log_entry_trad_cache; + +#define DDT_LOG_ENTRY_FLAT_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_LOG_ENTRY_TRAD_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_LOG_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) + +void +ddt_log_init(void) +{ + ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", + DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", + DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Max memory for log AVL entries. At least 1M, because we need + * something (that's ~3800 entries per tree). They can say 100% if they + * want; it just means they're at the mercy of the the txg flush limit. + */ + if (zfs_dedup_log_mem_max == 0) { + zfs_dedup_log_mem_max_percent = + MIN(zfs_dedup_log_mem_max_percent, 100); + zfs_dedup_log_mem_max = (physmem * PAGESIZE) * + zfs_dedup_log_mem_max_percent / 100; + } + zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); +} + +void +ddt_log_fini(void) +{ + kmem_cache_destroy(ddt_log_entry_trad_cache); + kmem_cache_destroy(ddt_log_entry_flat_cache); +} + +static void +ddt_log_name(ddt_t *ddt, char *name, uint_t n) +{ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, + zio_checksum_table[ddt->ddt_checksum].ci_name, n); +} + +static void +ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) +{ + dmu_buf_t *db; + VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; + DLH_SET_VERSION(hdr, 1); + DLH_SET_FLAGS(hdr, ddl->ddl_flags); + hdr->dlh_length = ddl->ddl_length; + hdr->dlh_first_txg = ddl->ddl_first_txg; + hdr->dlh_checkpoint = ddl->ddl_checkpoint; + + dmu_buf_rele(db, FTAG); +} + +static void +ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + ASSERT3U(ddl->ddl_object, ==, 0); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &ddl->ddl_object, tx)); + ddl->ddl_length = 0; + ddl->ddl_first_txg = tx->tx_txg; + ddt_log_update_header(ddt, ddl, tx); +} + +static void +ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + + if (ddl->ddl_object == 0) + return; + + ASSERT0(ddl->ddl_length); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); + VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); + + ddl->ddl_object = 0; +} + +void +ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_update_stats(ddt_t *ddt) +{ + /* + * Log object stats. 
We count the number of live entries in the log + * tree, even if there are more than on disk, and even if the same + * entry is on both append and flush trees, because that's more what + * the user expects to see. This does mean the on-disk size is not + * really correlated with the number of entries, but I don't think + * that's reasonable to expect anyway. + */ + dmu_object_info_t doi; + uint64_t nblocks; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); + nblocks = doi.doi_physical_blocks_512; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); + nblocks += doi.doi_physical_blocks_512; + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo->ddo_count = + avl_numnodes(&ddt->ddt_log_active->ddl_tree) + + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); + ddo->ddo_dspace = nblocks << 9; +} + +void +ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) +{ + ASSERT3U(nentries, >, 0); + ASSERT3P(dlu->dlu_dbp, ==, NULL); + + if (ddt->ddt_log_active->ddl_object == 0) + ddt_log_create(ddt, tx); + + /* + * We want to store as many entries as we can in a block, but never + * split an entry across block boundaries. + */ + size_t reclen = P2ALIGN_TYPED( + sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); + ASSERT3U(reclen, <=, UINT16_MAX); + dlu->dlu_reclen = reclen; + + VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, + &dlu->dlu_dn)); + dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); + + uint64_t nblocks = howmany(nentries, + dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); + uint64_t offset = ddt->ddt_log_active->ddl_length; + uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; + + VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, + B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + DMU_READ_NO_PREFETCH)); + + dlu->dlu_tx = tx; + dlu->dlu_block = dlu->dlu_offset = 0; +} + +static ddt_log_entry_t * +ddt_log_alloc_entry(ddt_t *ddt) +{ + ddt_log_entry_t *ddle; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); + } else { + ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); + } + + return (ddle); +} + +static void +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + /* Create the log tree entry from a live or stored entry */ + avl_index_t where; + ddt_log_entry_t *ddle = + avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); + if (ddle == NULL) { + ddle = ddt_log_alloc_entry(ddt); + ddle->ddle_key = ddlwe->ddlwe_key; + avl_insert(&ddl->ddl_tree, ddle, where); + } + ddle->ddle_type = ddlwe->ddlwe_type; + ddle->ddle_class = ddlwe->ddlwe_class; + memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); +} + +void +ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + /* Get our block */ + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; + + /* + * If this would take us past the end of the block, finish it and + * move to the next one. 
+ */ + if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { + ASSERT3U(dlu->dlu_offset, >, 0); + dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); + dlu->dlu_block++; + dlu->dlu_offset = 0; + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + db = dlu->dlu_dbp[dlu->dlu_block]; + } + + /* + * If this is the first time touching the block, inform the DMU that + * we will fill it, and zero it out. + */ + if (dlu->dlu_offset == 0) { + dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + memset(db->db_data, 0, db->db_size); + } + + /* Create the log record directly in the buffer */ + ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); + DLR_SET_TYPE(dlr, DLR_ENTRY); + DLR_SET_RECLEN(dlr, dlu->dlu_reclen); + DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); + DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)&dlr->dlr_payload; + dlre->dlre_key = ddlwe->ddlwe_key; + memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + + /* Advance offset for next record. */ + dlu->dlu_offset += dlu->dlu_reclen; +} + +void +ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); + ASSERT3U(dlu->dlu_offset, >, 0); + + /* + * Close out the last block. Whatever we haven't used will be zeroed, + * which matches DLR_INVALID, so we can detect this during load. + */ + dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); + + dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + + ddt->ddt_log_active->ddl_length += + dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; + dnode_rele(dlu->dlu_dn, FTAG); + + ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); + + memset(dlu, 0, sizeof (ddt_log_update_t)); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +void +ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + ASSERT3U(ddl->ddl_object, !=, 0); + +#ifdef ZFS_DEBUG + /* + * There should not be any entries on the log tree before the given + * checkpoint. Assert that this is the case. 
+ */ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle != NULL) + VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), + >, 0); +#endif + + ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; + ddl->ddl_checkpoint = ddlwe->ddlwe_key; + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +void +ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + if (ddl->ddl_object == 0) + return; + + ASSERT(avl_is_empty(&ddl->ddl_tree)); + + /* Eject the entire object */ + dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); + + ddl->ddl_length = 0; + ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; + memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) +{ + /* Swap the logs. The old flushing one must be empty */ + VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); + + /* + * If there are still blocks on the flushing log, truncate it first. + * This can happen if there were entries on the flushing log that were + * removed in memory via ddt_lookup(); their vestigal remains are + * on disk. + */ + if (ddt->ddt_log_flushing->ddl_length > 0) + ddt_log_truncate(ddt, tx); + + /* + * Swap policy. We swap the logs (and so begin flushing) when the + * active tree grows too large, or when we haven't swapped it in + * some amount of time. + */ + + /* + * The log tree is too large if the memory usage of its entries is over + * half of the memory limit. This effectively gives each log tree half + * the available memory. + */ + const boolean_t too_large = + (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * + DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); + + const boolean_t too_old = + tx->tx_txg >= + (ddt->ddt_log_active->ddl_first_txg + + MAX(1, zfs_dedup_log_txg_max)); + + if (!(too_large || too_old)) + return (B_FALSE); + + ddt_log_t *swap = ddt->ddt_log_active; + ddt->ddt_log_active = ddt->ddt_log_flushing; + ddt->ddt_log_flushing = swap; + + ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); + ddt->ddt_log_active->ddl_flags &= + ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); + + ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; + + ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; + + ddt_log_update_header(ddt, ddt->ddt_log_active, tx); + ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); + + ddt_log_update_stats(ddt); + + return (B_TRUE); +} + +static inline void +ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, + const ddt_key_t *checkpoint) +{ + ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)dlr->dlr_payload; + if (checkpoint != NULL && + ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { + /* Skip pre-checkpoint entries; they're already flushed. 
*/ + return; + } + + ddt_lightweight_entry_t ddlwe; + ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); + ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); + + ddlwe.ddlwe_key = dlre->dlre_key; + memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); + + ddt_log_update_entry(ddt, ddl, &ddlwe); +} + +static void +ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) +{ + void *cookie = NULL; + ddt_log_entry_t *ddle; + IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); + while ((ddle = + avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + } + ASSERT(avl_is_empty(&ddl->ddl_tree)); +} + +static int +ddt_log_load_one(ddt_t *ddt, uint_t n) +{ + ASSERT3U(n, <, 2); + + ddt_log_t *ddl = &ddt->ddt_log[n]; + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + uint64_t obj; + int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &obj); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + dnode_t *dn; + err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); + if (err != 0) + return (err); + + ddt_log_header_t hdr; + dmu_buf_t *db; + err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + return (err); + } + memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); + dmu_buf_rele(db, FTAG); + + if (DLH_GET_VERSION(&hdr) != 1) { + dnode_rele(dn, FTAG); + zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " + "unknown version=%llu", spa_name(ddt->ddt_spa), name, + (u_longlong_t)DLH_GET_VERSION(&hdr)); + return (SET_ERROR(EINVAL)); + } + + ddt_key_t *checkpoint = NULL; + if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { + /* + * If the log has a checkpoint, then we can ignore any entries + * that have already been flushed. + */ + ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); + checkpoint = &hdr.dlh_checkpoint; + } + + if (hdr.dlh_length > 0) { + dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, + ZIO_PRIORITY_SYNC_READ); + + for (uint64_t offset = 0; offset < hdr.dlh_length; + offset += dn->dn_datablksz) { + err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, + DMU_READ_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (err); + } + + uint64_t boffset = 0; + while (boffset < db->db_size) { + ddt_log_record_t *dlr = + (ddt_log_record_t *)(db->db_data + boffset); + + /* Partially-filled block, skip the rest */ + if (DLR_GET_TYPE(dlr) == DLR_INVALID) + break; + + switch (DLR_GET_TYPE(dlr)) { + case DLR_ENTRY: + ddt_log_load_entry(ddt, ddl, dlr, + checkpoint); + break; + + default: + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (SET_ERROR(EINVAL)); + } + + boffset += DLR_GET_RECLEN(dlr); + } + + dmu_buf_rele(db, FTAG); + } + } + + dnode_rele(dn, FTAG); + + ddl->ddl_object = obj; + ddl->ddl_flags = DLH_GET_FLAGS(&hdr); + ddl->ddl_length = hdr.dlh_length; + ddl->ddl_first_txg = hdr.dlh_first_txg; + + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + ddt->ddt_log_flushing = ddl; + else + ddt->ddt_log_active = ddl; + + return (0); +} + +int +ddt_log_load(ddt_t *ddt) +{ + int err; + + if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { + /* + * The DDT is going to be freed again in a moment, so there's + * no point loading the log; it'll just slow down import. 
+		 */
+		return (0);
+	}
+
+	ASSERT0(ddt->ddt_log[0].ddl_object);
+	ASSERT0(ddt->ddt_log[1].ddl_object);
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then the log object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if ((err = ddt_log_load_one(ddt, 0)) != 0)
+		return (err);
+	if ((err = ddt_log_load_one(ddt, 1)) != 0)
+		return (err);
+
+	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
+	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
+
+	/*
+	 * We have two finalisation tasks:
+	 *
+	 * - rebuild the histogram. We do this at the end rather than while
+	 *   we're loading so we don't need to uncount and recount entries that
+	 *   appear multiple times in the log.
+	 *
+	 * - remove entries from the flushing tree that are on both trees. This
+	 *   happens when ddt_lookup() rehydrates an entry from the flushing
+	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
+	 *   tree but doesn't remove it from disk.
+	 */
+
+	/*
+	 * We don't technically need a config lock here, since there shouldn't
+	 * be pool config changes during DDT load. dva_get_dsize_sync() via
+	 * ddt_stat_generate() is expecting it though, and it won't hurt
+	 * anything, so we take it.
+	 */
+	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
+
+	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
+	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
+	ddt_log_entry_t *ae = avl_first(al);
+	ddt_log_entry_t *fe = avl_first(fl);
+	while (ae != NULL || fe != NULL) {
+		ddt_log_entry_t *ddle;
+		if (ae == NULL) {
+			/* active exhausted, take flushing */
+			ddle = fe;
+			fe = AVL_NEXT(fl, fe);
+		} else if (fe == NULL) {
+			/* flushing exhausted, take active */
+			ddle = ae;
+			ae = AVL_NEXT(al, ae);
+		} else {
+			/* compare active and flushing */
+			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
+			if (c < 0) {
+				/* active behind, take and advance */
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			} else if (c > 0) {
+				/* flushing behind, take and advance */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+			} else {
+				/* match.
remove from flushing, take active */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + avl_remove(fl, ddle); + + ddle = ae; + ae = AVL_NEXT(al, ae); + } + } + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + + spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); + + ddt_log_update_stats(ddt); + + return (0); +} + +void +ddt_log_alloc(ddt_t *ddt) +{ + ASSERT3P(ddt->ddt_log_active, ==, NULL); + ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + + avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + ddt->ddt_log_active = &ddt->ddt_log[0]; + ddt->ddt_log_flushing = &ddt->ddt_log[1]; + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; +} + +void +ddt_log_free(ddt_t *ddt) +{ + ddt_log_empty(ddt, &ddt->ddt_log[0]); + ddt_log_empty(ddt, &ddt->ddt_log[1]); + avl_destroy(&ddt->ddt_log[0].ddl_tree); + avl_destroy(&ddt->ddt_log[1].ddl_tree); +} + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, + "Max transactions before starting to flush dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, + "Max memory for dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, + "Max memory for dedup logs, as % of total memory"); diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 9316200f2..8f55bc24f 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddo_total->ddo_mspace += ddo->ddo_mspace; } } + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } /* @@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) &ddt->ddt_histogram_cache[type][class]); } } + + ddt_histogram_add(ddh, &ddt->ddt_log_histogram); } } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 3de316a12..96943421f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special +DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 83c4d7c8e..4f6e5805b 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -29,9 +29,16 @@ log_assert "basic dedup (FDT) operations work" +# we 
set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index f0f20671b..259eaddc0 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -29,9 +29,16 @@ log_assert "dedup (FDT) retains version after import" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index 049ccaae3..114cf0266 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -30,9 +30,16 @@ log_assert "legacy and FDT dedup tables on the same pool can happily coexist" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index d563fade8..c36463134 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -30,9 +30,16 @@ log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 5b83a1ca3..326152b51 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -51,6 +51,12 @@ POOL="dedup_pool" save_tunable TXG_TIMEOUT +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. 
without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	if poolexists $POOL ; then
@@ -58,6 +64,7 @@ function cleanup
 	fi
 	log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
 	log_must restore_tunable TXG_TIMEOUT
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 
@@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit
 
 	#
 	# With no DDT quota in place, the above workload will produce over
-	# 800,000 entries by using space in the normal class. With a quota,
-	# it will be well below 500,000 entries.
+	# 800,000 entries by using space in the normal class. With a quota, it
+	# should be well under 500,000. However, logged entries are hard to
+	# account for because they can appear on both logs, and can also
+	# represent an eventual removal. This isn't easily visible from
+	# outside, and even internally can result in going slightly over quota.
+	# Here we just set the entry count a little higher than we expect, to
+	# allow for some instability.
 	#
-	log_must test $(ddt_entries) -le 500000
+	log_must test $(ddt_entries) -le 600000
 
 	do_clean
 }
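
As a side note on the swap policy in the ddt_log_swap() hunk above: the rotation decision reduces to two independent triggers, size and age, and either one is enough to rotate the logs and begin flushing. The standalone sketch below models only that check; the constants are illustrative stand-ins for zfs_dedup_log_mem_max, zfs_dedup_log_txg_max and DDT_LOG_ENTRY_SIZE(), not the module defaults.

/*
 * Minimal sketch of the "too large or too old" swap decision, under
 * assumed values for the tunables and per-entry footprint.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_MEM_MAX	(64ULL << 20)	/* assumed cap on in-core log memory */
#define LOG_TXG_MAX	8		/* assumed txg limit before a swap */
#define LOG_ENTRY_SIZE	512ULL		/* assumed per-entry footprint */

static bool
should_swap(uint64_t active_entries, uint64_t active_first_txg,
    uint64_t cur_txg)
{
	/* "Too large": the active tree uses over half the memory cap. */
	bool too_large =
	    (active_entries * LOG_ENTRY_SIZE) >= (LOG_MEM_MAX >> 1);

	/* "Too old": it has been accumulating for too many txgs. */
	bool too_old = cur_txg >= active_first_txg + LOG_TXG_MAX;

	return (too_large || too_old);
}

int
main(void)
{
	/* Small and recently swapped: keep appending (prints 0). */
	printf("%d\n", should_swap(1000, 100, 103));
	/* Same size but 8 txgs old: rotate and start flushing (prints 1). */
	printf("%d\n", should_swap(1000, 100, 108));
	/* Over half the memory cap: rotate regardless of age (prints 1). */
	printf("%d\n", should_swap(70000, 100, 101));
	return (0);
}

A large active tree rotates immediately, while a small but stale one rotates once the assumed txg limit has elapsed since ddl_first_txg, mirroring the too_large/too_old test in the hunk above.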