-rw-r--r--  cmd/zdb/zdb.c                                                        |  33
-rw-r--r--  include/sys/ddt.h                                                    |  39
-rw-r--r--  include/sys/ddt_impl.h                                               | 131
-rw-r--r--  include/sys/dmu.h                                                    |   1
-rw-r--r--  lib/libzpool/Makefile.am                                             |   1
-rw-r--r--  man/man4/zfs.4                                                       |  82
-rw-r--r--  module/Kbuild.in                                                     |   1
-rw-r--r--  module/Makefile.bsd                                                  |   2
-rw-r--r--  module/zfs/ddt.c                                                     | 604
-rw-r--r--  module/zfs/ddt_log.c                                                 | 760
-rw-r--r--  module/zfs/ddt_stats.c                                               |   9
-rw-r--r--  tests/zfs-tests/include/tunables.cfg                                 |   1
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh          |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh          |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh    |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh  |   7
-rwxr-xr-x  tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh               |  18
17 files changed, 1600 insertions, 110 deletions
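
For orientation before the per-file diffs: the dedup log added by this change is governed by the new zfs_dedup_log_* tunables documented in man/man4/zfs.4 and declared via ZFS_MODULE_PARAM in module/zfs/ddt.c and module/zfs/ddt_log.c below. A minimal sketch of inspecting and adjusting them at runtime, assuming the usual Linux module-parameter layout under /sys/module/zfs/parameters (the values shown are illustrative, not recommendations):

    # Show the current dedup-log tunables
    grep . /sys/module/zfs/parameters/zfs_dedup_log_*

    # Flush at least 2000 entries per txg instead of the default 1000, and let
    # the active log accumulate for up to 16 txgs (default 8) before swapping.
    # zfs_dedup_log_mem_max and zfs_dedup_log_mem_max_percent are declared
    # ZMOD_RD below, so they can only be set at module load time.
    echo 2000 > /sys/module/zfs/parameters/zfs_dedup_log_flush_entries_min
    echo 16 > /sys/module/zfs/parameters/zfs_dedup_log_txg_max
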
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 250052adf..c72df3909 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1959,6 +1959,32 @@ dump_dedup_ratio(const ddt_stat_t *dds) } static void +dump_ddt_log(ddt_t *ddt) +{ + for (int n = 0; n < 2; n++) { + ddt_log_t *ddl = &ddt->ddt_log[n]; + + uint64_t count = avl_numnodes(&ddl->ddl_tree); + if (count == 0) + continue; + + printf(DMU_POOL_DDT_LOG ": %lu log entries\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); + + if (dump_opt['D'] < 4) + continue; + + ddt_lightweight_entry_t ddlwe; + uint64_t index = 0; + for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + dump_ddt_entry(ddt, &ddlwe, index++); + } + } +} + +static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; @@ -2027,6 +2053,7 @@ dump_all_ddts(spa_t *spa) dump_ddt(ddt, type, class); } } + dump_ddt_log(ddt); } ddt_get_dedup_stats(spa, &dds_total); @@ -5743,7 +5770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); + VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); ddt_phys_decref(dde->dde_phys, v); /* @@ -8120,6 +8147,10 @@ dump_mos_leaks(spa_t *spa) /* FDT container */ mos_obj_refd(ddt->ddt_dir_object); + + /* FDT log objects */ + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); } if (spa->spa_brt != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 2dd18526d..2fc798725 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -43,7 +43,8 @@ struct abd; * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ -#define DDT_FLAG_MASK (DDT_FLAG_FLAT) +#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -209,6 +210,7 @@ typedef enum { /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ /* * Additional data to support entry update or repair. This is fixed size @@ -255,6 +257,19 @@ typedef struct { } ddt_lightweight_entry_t; /* + * In-core DDT log. A separate struct to make it easier to switch between the + * appending and flushing logs. + */ +typedef struct { + avl_tree_t ddl_tree; /* logged entries */ + uint32_t ddl_flags; /* flags for this log */ + uint64_t ddl_object; /* log object id */ + uint64_t ddl_length; /* on-disk log size */ + uint64_t ddl_first_txg; /* txg log became active */ + ddt_key_t ddl_checkpoint; /* last checkpoint */ +} ddt_log_t; + +/* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. 
*/ @@ -262,8 +277,22 @@ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + avl_tree_t ddt_log_tree; /* logged entries */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + ddt_log_t ddt_log[2]; /* active/flushing logs */ + ddt_log_t *ddt_log_active; /* pointers into ddt_log */ + ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ + + hrtime_t ddt_flush_start; /* log flush start this txg */ + uint32_t ddt_flush_pass; /* log flush pass this txg */ + + int32_t ddt_flush_count; /* entries flushed this txg */ + int32_t ddt_flush_min; /* min rem entries to flush */ + int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ + int32_t ddt_log_flush_rate; /* rolling log flush rate */ + int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ @@ -276,13 +305,17 @@ typedef struct { /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - /* object ids for whole-ddt and per-type/per-class stats */ + /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; + ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + + /* log stats power-2-sized referenced blocks */ + ddt_histogram_t ddt_log_histogram; } ddt_t; /* diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index ce4bc559d..6f11cd90c 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -28,6 +28,7 @@ #define _SYS_DDT_IMPL_H #include <sys/ddt.h> +#include <sys/bitops.h> #ifdef __cplusplus extern "C" { @@ -50,6 +51,106 @@ extern "C" { memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) +#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ + (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ + (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +/* + * An entry on the log tree. These are "frozen", and a record of what's in + * the on-disk log. They can't be used in place, but can be "loaded" back into + * the live tree. + */ +typedef struct { + ddt_key_t ddle_key; /* ddt_log_tree key */ + avl_node_t ddle_node; /* ddt_log_tree node */ + + ddt_type_t ddle_type; /* storage type */ + ddt_class_t ddle_class; /* storage class */ + + /* extra allocation for flat/trad phys */ + ddt_univ_phys_t ddle_phys[]; +} ddt_log_entry_t; + +/* On-disk log record types. */ +typedef enum { + DLR_INVALID = 0, /* end of block marker */ + DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ +} ddt_log_record_type_t; + +/* On-disk log record header. */ +typedef struct { + /* + * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to + * access it. 
+ * + * bits 0-7: record type (ddt_log_record_type_t) + * bits 8-15: length of record header+payload + * bits 16-47: reserved, all zero + * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) + * otherwise all zero + * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) + * otherwise all zero + */ + uint64_t dlr_info; + uint8_t dlr_payload[]; +} ddt_log_record_t; + +#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) +#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) +#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) +#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) +#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) +#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) +#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) +#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) + +/* Payload for DLR_ENTRY. */ +typedef struct { + ddt_key_t dlre_key; + ddt_univ_phys_t dlre_phys[]; +} ddt_log_record_entry_t; + +/* Log flags (ddl_flags, dlh_flags) */ +#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ +#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ + +/* On-disk log header, stored in the bonus buffer. */ +typedef struct { + /* + * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to + * access it. + * + * bits 0-7: log version + * bits 8-15: log flags + * bits 16-63: reserved, all zero + */ + uint64_t dlh_info; + + uint64_t dlh_length; /* log size in bytes */ + uint64_t dlh_first_txg; /* txg this log went active */ + ddt_key_t dlh_checkpoint; /* last checkpoint */ +} ddt_log_header_t; + +#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) +#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) +#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) +#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) + +/* DDT log update state */ +typedef struct { + dmu_tx_t *dlu_tx; /* tx the update is being applied to */ + dnode_t *dlu_dn; /* log object dnode */ + dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ + int dlu_ndbp; /* number of block buffer pointers */ + uint16_t dlu_reclen; /* cached length of record */ + uint64_t dlu_block; /* block for next entry */ + uint64_t dlu_offset; /* offset for next entry */ +} ddt_log_update_t; + /* * Ops vector to access a specific DDT object type. */ @@ -77,6 +178,33 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; +/* Dedup log API */ +extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, + ddt_log_update_t *dlu); +extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, + ddt_log_update_t *dlu); +extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); + +extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + dmu_tx_t *tx); +extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); + +extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); + +extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); + +extern int ddt_log_load(ddt_t *ddt); +extern void ddt_log_alloc(ddt_t *ddt); +extern void ddt_log_free(ddt_t *ddt); + +extern void ddt_log_init(void); +extern void ddt_log_fini(void); + /* * These are only exposed so that zdb can access them. 
Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops; */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, + const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5b80dc315..928f5f2b4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -375,6 +375,7 @@ typedef struct dmu_buf { #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_LOG "DDT-log-%s-%u" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db..070dc0132 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_log.c \ module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 45b6c338a..aae3d7dfb 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -974,6 +974,88 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . +.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of dedup log flush passes (iterations) each transaction. +.Pp +At the start of each transaction, OpenZFS will estimate how many entries it +needs to flush out to keep up with the change rate, taking the amount and time +taken to flush on previous txgs into account (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +It will spread this amount into a number of passes. +At each pass, it will use the amount already flushed and the total time taken +by flushing and by other IO to recompute how much it should do for the remainder +of the txg. +.Pp +Reducing the max number of passes will make flushing more aggressive, flushing +out more entries on each pass. +This can be faster, but also more likely to compete with other IO. +Increasing the max number of passes will put fewer entries onto each pass, +keeping the overhead of dedup changes to a minimum but possibly causing a large +number of changes to be dumped on the last pass, which can blow out the txg +sync time beyond +.Sy zfs_txg_timeout . +. +.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint +Minimum time to spend on dedup log flush each transaction. +.Pp +At least this long will be spent flushing dedup log entries each transaction, +up to +.Sy zfs_txg_timeout . +This occurs even if doing so would delay the transaction, that is, other IO +completes under this time. +. +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +Flush at least this many entries each transaction. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction to +keep up with the ingest rate (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +This sets the minimum for that estimate. +Raising it can force OpenZFS to flush more aggressively, keeping the log small +and so reducing pool import times, but can make it less able to back off if +log flushing would compete with other IO too much. +. 
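
One way to observe the effect of these flush tunables is the zdb support added in cmd/zdb/zdb.c earlier in this diff, which prints a per-checksum summary of the in-memory log and, at higher verbosity, each log entry via dump_ddt_log(). A sketch, assuming a pool named "tank" (the pool name is hypothetical):

    # Repeat -D for more detail; four or more also dumps individual log
    # entries (the code skips them when dump_opt['D'] < 4).
    zdb -DDDD tank
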
+.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint +Number of transactions to use to compute the flow rate. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction by +monitoring the number of entries changed (ingest rate), number of entries +flushed (flush rate) and time spent flushing (flush time rate) and combining +these into an overall "flow rate". +It will use an exponential weighted moving average over some number of recent +transactions to compute these rates. +This sets the number of transactions to compute these averages over. +Setting it higher can help to smooth out the flow rate in the face of spiky +workloads, but will take longer for the flow rate to adjust to a sustained +change in the ingress rate. +. +.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint +Max transactions to before starting to flush dedup logs. +.Pp +OpenZFS maintains two dedup logs, one receiving new changes, one flushing. +If there is nothing to flush, it will accumulate changes for no more than this +many transactions before switching the logs and starting to flush entries out. +. +.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 +Max memory to use for dedup logs. +.Pp +OpenZFS will spend no more than this much memory on maintaining the in-memory +dedup log. +Flushing will begin when around half this amount is being spent on logs. +The default value of +.Sy 0 +will cause it to be set by +.Sy zfs_dedup_log_mem_max_percent +instead. +. +.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint +Max memory to use for dedup logs, as a percentage of total memory. +.Pp +If +.Sy zfs_dedup_log_mem_max +is not set, it will be initialised as a percentage of the total memory in the +system. +. .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of diff --git a/module/Kbuild.in b/module/Kbuild.in index 57682214d..a119198db 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -322,6 +322,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index d9d31564d..534f32571 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -252,6 +252,7 @@ SRCS+= abd.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 26e127d61..ce5c4efb5 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,28 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup log + * + * Historically, all entries modified on a txg were written back to dedup + * storage objects at the end of every txg. This could cause significant + * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, + * and so required reading the whole node, updating the entry, and writing it + * back. On busy pools, this could add serious IO and memory overheads. + * + * To address this, the dedup log was added. 
If the "fast_dedup" feature is + * enabled, at the end of each txg, modified entries will be copied to an + * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the + * same block is requested again, the in-memory object will be checked first, + * and if its there, the entry inflated back onto the live tree without going + * to storage. The on-disk log is only read at pool import time, to reload the + * in-memory log. + * + * Each txg, some amount of the in-memory log will be flushed out to a DDT + * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to + * keep up with the rate of change on dedup entries, but not so much that it + * would impact overall throughput, and not using too much memory. See the + * zfs_dedup_log_* tuneables in zfs(4) for more details. + * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in @@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0; uint_t dedup_class_wait_txgs = 5; +/* + * Don't do more than this many incremental flush passes per txg. + */ +uint_t zfs_dedup_log_flush_passes_max = 8; + +/* + * Minimum time to flush per txg. + */ +uint_t zfs_dedup_log_flush_min_time_ms = 1000; + +/* + * Minimum entries to flush per txg. + */ +uint_t zfs_dedup_log_flush_entries_min = 1000; + +/* + * Number of txgs to average flow rates across. + */ +uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; + static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = DDT_FLAG_FLAT, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; /* Dummy version to signal that configure is still necessary */ @@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); + ddt->ddt_object[type][class], &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) } uint64_t -ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; - if (ddt->ddt_flags & DDT_FLAG_FLAT) { - refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; - } else { - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; - } + if (ddt->ddt_flags & DDT_FLAG_FLAT) + refcnt = ddp->ddp_flat.ddp_refcnt; + else + for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) + refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } @@ -743,11 +784,15 @@ ddt_init(void) DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + ddt_log_init(); } void ddt_fini(void) { + ddt_log_fini(); + kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); @@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + /* Entry is still in the log, 
so charge the entry back to it */ + if (dde->dde_flags & DDE_FLAG_LOGGED) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } @@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) avl_insert(&ddt->ddt_tree, dde, where); + /* If its in the log tree, we can "load" it from there */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + ddt_lightweight_entry_t ddlwe; + + if (ddt_log_take_key(ddt, ddt->ddt_log_active, + &search, &ddlwe) || + ddt_log_take_key(ddt, ddt->ddt_log_flushing, + &search, &ddlwe)) { + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + + dde->dde_type = ddlwe.ddlwe_type; + dde->dde_class = ddlwe.ddlwe_class; + memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, + DDT_PHYS_SIZE(ddt)); + + return (dde); + } + } + /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, @@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { /* - * The histograms only track inactive (stored) blocks. + * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. When its synced back, it'll be re-added * to the right one. + * + * We only do this when we successfully found it in the store. + * error == ENOENT means this is a new entry, and so its already + * not counted. */ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) } } + ddt_log_destroy(ddt, tx); + uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, @@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt_log_alloc(ddt); + return (ddt); } static void ddt_table_free(ddt_t *ddt) { + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); @@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa) } } + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + /* * Seed the cached histograms. */ @@ -1483,145 +1567,447 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) } static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) +{ + /* + * Count all the entries stored for each type/class, and updates the + * stats within (ddt_object_sync()). If there's no entries for the + * type/class, the whole object is removed. If all objects for the DDT + * are removed, its containing dir is removed, effectively resetting + * the entire DDT to an empty slate. 
+ */ + uint64_t count = 0; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + uint64_t add, tcount = 0; + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); + tcount += add; + } + } + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (tcount == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + count += tcount; + } + + if (ddt->ddt_flags & DDT_FLAG_LOG) { + /* Include logged entries in the total count */ + count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); + count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. + */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; +} + +static void +ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_key_t *ddk = &dde->dde_key; - ddt_type_t otype = dde->dde_type; - ddt_type_t ntype = DDT_TYPE_DEFAULT; - ddt_class_t oclass = dde->dde_class; - ddt_class_t nclass; - uint64_t total_refcnt = 0; - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Compute the target class, so we can decide whether or not to inform + * the scrub traversal (below). Note that we don't store this in the + * entry, as it might change multiple times before finally being + * committed (if we're logging). Instead, we recompute it in + * ddt_sync_entry(). + */ + uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + + /* + * If the class changes, the order that we scan this bp changes. If it + * decreases, we could miss it, so scan it right now. (This covers both + * class changing while we are doing ddt_walk(), and when we are + * traversing.) + * + * We also do this when the refcnt goes to zero, because that change is + * only in the log so far; the blocks on disk won't be freed until + * the log is flushed, and the refcnt might increase before that. If it + * does, then we could miss it in the same way. + */ + if (refcnt == 0 || nclass < ddlwe->ddlwe_class) + dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, + ddlwe, tx); +} + +static void +ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) +{ + ddt_key_t *ddk = &ddlwe->ddlwe_key; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + uint64_t refcnt = 0; + /* + * Compute the total refcnt. Along the way, issue frees for any DVAs + * we no longer want. 
+ */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ASSERT(dde->dde_io == NULL || - dde->dde_io->dde_lead_zio[p] == NULL); - ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT0(phys_refcnt); + ASSERT3U(phys_refcnt, ==, 0); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* - * Note, we no longer create DDT-DITTO blocks, but we - * don't want to leak any written by older software. + * We don't want to keep any obsolete slots (eg ditto), + * regardless of their refcount, but we don't want to + * leak them either. So, free them. */ - ddt_phys_free(ddt, ddk, ddp, v, txg); + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, v, txg); - total_refcnt += phys_refcnt; + /* No remaining references, free it! */ + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); + refcnt += phys_refcnt; } - if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; + /* Select the best class for the entry. */ + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + /* + * If an existing entry changed type or class, or its refcount reached + * zero, delete it from the DDT object + */ if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { + (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); - ASSERT3U( - ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); + ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; + /* + * Add or update the entry + */ + if (refcnt != 0) { + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + + ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); + } +} - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); +/* Calculate an exponential weighted moving average, lower limited to zero */ +static inline int32_t +_ewma(int32_t val, int32_t prev, uint32_t weight) +{ + ASSERT3U(val, >=, 0); + ASSERT3U(prev, >=, 0); + const int32_t new = + MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); + ASSERT3U(new, >=, 0); + return (new); +} - ddt_histogram_t *ddh = - &ddt->ddt_histogram[ntype][nclass]; - ddt_histogram_add_entry(ddt, ddh, &ddlwe); +/* Returns true if done for this txg */ +static boolean_t +ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) +{ + if (ddt->ddt_flush_pass == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* First run this txg, get set up */ + ddt->ddt_flush_start = gethrtime(); + ddt->ddt_flush_count = 0; + /* + * How many entries we need to flush. We want to at + * least match the ingest rate. + */ + ddt->ddt_flush_min = MAX( + ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_entries_min); + } else { + /* We already decided we're done for this txg */ + return (B_FALSE); + } + } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) 
+ * We already did some flushing on this pass, skip it. This + * happens when dsl_process_async_destroys() runs during a scan + * (on pass 1) and does an additional ddt_sync() to update + * freed blocks. */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, ddt, &ddlwe, tx); - } + return (B_FALSE); } + + if (spa_sync_pass(ddt->ddt_spa) > + MAX(zfs_dedup_log_flush_passes_max, 1)) { + /* Too many passes this txg, defer until next. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* Nothing to flush, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? + MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); + + uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; + + if (elapsed_time >= target_time) { + /* Too long since we started, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + ddt->ddt_flush_pass++; + ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); + + /* + * Estimate how much time we'll need to flush the remaining entries + * based on how long it normally takes. + */ + uint32_t want_time; + if (ddt->ddt_flush_pass == 1) { + /* First pass, use the average time/entries */ + if (ddt->ddt_log_flush_rate == 0) + /* Zero rate, just assume the whole time */ + want_time = target_time; + else + want_time = ddt->ddt_flush_min * + ddt->ddt_log_flush_time_rate / + ddt->ddt_log_flush_rate; + } else { + /* Later pass, calculate from this txg so far */ + want_time = ddt->ddt_flush_min * + elapsed_time / ddt->ddt_flush_count; + } + + /* Figure out how much time we have left */ + uint32_t remain_time = target_time - elapsed_time; + + /* Smear the remaining entries over the remaining passes. */ + uint32_t nentries = ddt->ddt_flush_min / + (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); + if (want_time > remain_time) { + /* + * We're behind; try to catch up a bit by doubling the amount + * this pass. If we're behind that means we're in a later + * pass and likely have most of the remaining time to + * ourselves. If we're in the last couple of passes, then + * doubling might just take us over the timeout, but probably + * not be much, and it stops us falling behind. If we're + * in the middle passes, there'll be more to do, but it + * might just help us catch up a bit and we'll recalculate on + * the next pass anyway. + */ + nentries = MIN(ddt->ddt_flush_min, nentries*2); + } + + ddt_lightweight_entry_t ddlwe; + uint32_t count = 0; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End this pass if we've synced as much as we need to. 
*/ + if (++count >= nentries) + break; + } + ddt->ddt_flush_count += count; + ddt->ddt_flush_min -= count; + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + ddt_log_truncate(ddt, tx); + /* No more passes needed this txg */ + ddt->ddt_flush_pass = 0; + } else + /* More to do next time, save checkpoint */ + ddt_log_checkpoint(ddt, &ddlwe, tx); + + ddt_sync_update_stats(ddt, tx); + + return (ddt->ddt_flush_pass == 0); } static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; + ASSERT(avl_is_empty(&ddt->ddt_tree)); - if (avl_numnodes(&ddt->ddt_tree) == 0) + /* Don't do any flushing when the pool is ready to shut down */ + if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) return; - ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); + /* Try to flush some. */ + if (!ddt_sync_flush_log_incremental(ddt, tx)) + /* More to do next time */ + return; - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); + /* No more flushing this txg, so we can do end-of-txg housekeeping */ + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { + /* + * No more to flush, and the active list has stuff, so + * try to swap the logs for next time. + */ + (void) ddt_log_swap(ddt, tx); } - if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) - ddt_create_dir(ddt, tx); + /* + * Update flush rate. This is an exponential weighted moving average of + * the number of entries flushed over recent txgs. + */ + ddt->ddt_log_flush_rate = _ewma( + ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + zfs_dedup_log_flush_flow_rate_txgs); - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(ddt, dde); - } + /* + * Update flush time rate. This is an exponential weighted moving + * average of the total time taken to flush over recent txgs. + */ + ddt->ddt_log_flush_time_rate = _ewma( + ddt->ddt_log_flush_time_rate, + ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + zfs_dedup_log_flush_flow_rate_txgs); +} - uint64_t count = 0; - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, tcount = 0; - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY0(ddt_object_count(ddt, type, class, - &add)); - tcount += add; - } +static void +ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) +{ + uint64_t count = avl_numnodes(&ddt->ddt_tree); + + if (count > 0) { + ddt_log_update_t dlu = {0}; + ddt_log_begin(ddt, count, tx, &dlu); + + ddt_entry_t *dde; + void *cookie = NULL; + ddt_lightweight_entry_t ddlwe; + while ((dde = + avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_log_entry(ddt, &ddlwe, &dlu); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (tcount == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); + + ddt_log_commit(ddt, &dlu); + + /* + * Sync the stats for the store objects. 
Even though we haven't + * modified anything on those objects, they're no longer the + * source of truth for entries that are now in the log, and we + * need the on-disk counts to reflect that, otherwise we'll + * miscount later when importing. + */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; + class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) + ddt_object_sync(ddt, type, class, tx); + } } - count += tcount; + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } - if (count == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) /* - * No entries left on the DDT, so reset the version for next - * time. This allows us to handle the feature being changed - * since the DDT was originally created. New entries should get - * whatever the feature currently demands. + * Update ingest rate. This is an exponential weighted moving + * average of the number of entries changed over recent txgs. + * The ramp-up cost shouldn't matter too much because the + * flusher will be trying to take at least the minimum anyway. */ - if (ddt->ddt_version == DDT_VERSION_FDT) - ddt_destroy_dir(ddt, tx); + ddt->ddt_log_ingest_rate = _ewma( + count, ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_flow_rate_txgs); +} - ddt->ddt_version = DDT_VERSION_UNCONFIGURED; - ddt->ddt_flags = 0; +static void +ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) +{ + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ddt_entry_t *dde; + void *cookie = NULL; + while ((dde = avl_destroy_nodes( + &ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_sync_flush_entry(ddt, &ddlwe, + dde->dde_type, dde->dde_class, tx); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + ddt_sync_update_stats(ddt, tx); +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + + if (ddt->ddt_version == UINT64_MAX) + return; + + if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + return; + } + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_table_log(ddt, tx); + else + ddt_sync_table_flush(ddt, tx); } void @@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg) ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; - ddt_sync_table(ddt, tx, txg); + ddt_sync_table(ddt, tx); + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } @@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } - if (dde->dde_type < DDT_TYPES) { - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - + if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { + /* + * This entry was either synced to a store object (dde_type is + * real) or was logged. 
It must be properly on disk at this + * point, so we can just bump its refcount. + */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * we may have a block with the DEDUP set, but which doesn't * have a corresponding entry in the DDT. Be ready. */ - ASSERT3S(dde->dde_class, ==, DDT_CLASSES); ddt_remove(ddt, dde); result = B_FALSE; } @@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, + "Max number of incremental dedup log flush passes per transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, + "Min time to spend on incremental dedup log flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, + "Min number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, + "Number of txgs to average flow rates across"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c new file mode 100644 index 000000000..7e7ff9e5b --- /dev/null +++ b/module/zfs/ddt_log.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, Klara Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/ddt.h> +#include <sys/dmu_tx.h> +#include <sys/dmu.h> +#include <sys/ddt_impl.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +/* + * No more than this many txgs before swapping logs. + */ +uint_t zfs_dedup_log_txg_max = 8; + +/* + * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module + * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. 
+ */ +uint64_t zfs_dedup_log_mem_max = 0; +uint_t zfs_dedup_log_mem_max_percent = 1; + + +static kmem_cache_t *ddt_log_entry_flat_cache; +static kmem_cache_t *ddt_log_entry_trad_cache; + +#define DDT_LOG_ENTRY_FLAT_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_LOG_ENTRY_TRAD_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_LOG_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) + +void +ddt_log_init(void) +{ + ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", + DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", + DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Max memory for log AVL entries. At least 1M, because we need + * something (that's ~3800 entries per tree). They can say 100% if they + * want; it just means they're at the mercy of the the txg flush limit. + */ + if (zfs_dedup_log_mem_max == 0) { + zfs_dedup_log_mem_max_percent = + MIN(zfs_dedup_log_mem_max_percent, 100); + zfs_dedup_log_mem_max = (physmem * PAGESIZE) * + zfs_dedup_log_mem_max_percent / 100; + } + zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); +} + +void +ddt_log_fini(void) +{ + kmem_cache_destroy(ddt_log_entry_trad_cache); + kmem_cache_destroy(ddt_log_entry_flat_cache); +} + +static void +ddt_log_name(ddt_t *ddt, char *name, uint_t n) +{ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, + zio_checksum_table[ddt->ddt_checksum].ci_name, n); +} + +static void +ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) +{ + dmu_buf_t *db; + VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; + DLH_SET_VERSION(hdr, 1); + DLH_SET_FLAGS(hdr, ddl->ddl_flags); + hdr->dlh_length = ddl->ddl_length; + hdr->dlh_first_txg = ddl->ddl_first_txg; + hdr->dlh_checkpoint = ddl->ddl_checkpoint; + + dmu_buf_rele(db, FTAG); +} + +static void +ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + ASSERT3U(ddl->ddl_object, ==, 0); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &ddl->ddl_object, tx)); + ddl->ddl_length = 0; + ddl->ddl_first_txg = tx->tx_txg; + ddt_log_update_header(ddt, ddl, tx); +} + +static void +ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + + if (ddl->ddl_object == 0) + return; + + ASSERT0(ddl->ddl_length); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); + VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); + + ddl->ddl_object = 0; +} + +void +ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_update_stats(ddt_t *ddt) +{ + /* + * Log object stats. 
We count the number of live entries in the log + * tree, even if there are more than on disk, and even if the same + * entry is on both append and flush trees, because that's more what + * the user expects to see. This does mean the on-disk size is not + * really correlated with the number of entries, but I don't think + * that's reasonable to expect anyway. + */ + dmu_object_info_t doi; + uint64_t nblocks; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); + nblocks = doi.doi_physical_blocks_512; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); + nblocks += doi.doi_physical_blocks_512; + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo->ddo_count = + avl_numnodes(&ddt->ddt_log_active->ddl_tree) + + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); + ddo->ddo_dspace = nblocks << 9; +} + +void +ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) +{ + ASSERT3U(nentries, >, 0); + ASSERT3P(dlu->dlu_dbp, ==, NULL); + + if (ddt->ddt_log_active->ddl_object == 0) + ddt_log_create(ddt, tx); + + /* + * We want to store as many entries as we can in a block, but never + * split an entry across block boundaries. + */ + size_t reclen = P2ALIGN_TYPED( + sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); + ASSERT3U(reclen, <=, UINT16_MAX); + dlu->dlu_reclen = reclen; + + VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, + &dlu->dlu_dn)); + dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); + + uint64_t nblocks = howmany(nentries, + dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); + uint64_t offset = ddt->ddt_log_active->ddl_length; + uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; + + VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, + B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + DMU_READ_NO_PREFETCH)); + + dlu->dlu_tx = tx; + dlu->dlu_block = dlu->dlu_offset = 0; +} + +static ddt_log_entry_t * +ddt_log_alloc_entry(ddt_t *ddt) +{ + ddt_log_entry_t *ddle; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); + } else { + ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); + } + + return (ddle); +} + +static void +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + /* Create the log tree entry from a live or stored entry */ + avl_index_t where; + ddt_log_entry_t *ddle = + avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); + if (ddle == NULL) { + ddle = ddt_log_alloc_entry(ddt); + ddle->ddle_key = ddlwe->ddlwe_key; + avl_insert(&ddl->ddl_tree, ddle, where); + } + ddle->ddle_type = ddlwe->ddlwe_type; + ddle->ddle_class = ddlwe->ddlwe_class; + memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); +} + +void +ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + /* Get our block */ + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; + + /* + * If this would take us past the end of the block, finish it and + * move to the next one. 
+ */ + if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { + ASSERT3U(dlu->dlu_offset, >, 0); + dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); + dlu->dlu_block++; + dlu->dlu_offset = 0; + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + db = dlu->dlu_dbp[dlu->dlu_block]; + } + + /* + * If this is the first time touching the block, inform the DMU that + * we will fill it, and zero it out. + */ + if (dlu->dlu_offset == 0) { + dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + memset(db->db_data, 0, db->db_size); + } + + /* Create the log record directly in the buffer */ + ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); + DLR_SET_TYPE(dlr, DLR_ENTRY); + DLR_SET_RECLEN(dlr, dlu->dlu_reclen); + DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); + DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)&dlr->dlr_payload; + dlre->dlre_key = ddlwe->ddlwe_key; + memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + + /* Advance offset for next record. */ + dlu->dlu_offset += dlu->dlu_reclen; +} + +void +ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); + ASSERT3U(dlu->dlu_offset, >, 0); + + /* + * Close out the last block. Whatever we haven't used will be zeroed, + * which matches DLR_INVALID, so we can detect this during load. + */ + dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); + + dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + + ddt->ddt_log_active->ddl_length += + dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; + dnode_rele(dlu->dlu_dn, FTAG); + + ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); + + memset(dlu, 0, sizeof (ddt_log_update_t)); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +void +ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + ASSERT3U(ddl->ddl_object, !=, 0); + +#ifdef ZFS_DEBUG + /* + * There should not be any entries on the log tree before the given + * checkpoint. Assert that this is the case. 
+ */ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle != NULL) + VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), + >, 0); +#endif + + ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; + ddl->ddl_checkpoint = ddlwe->ddlwe_key; + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +void +ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + if (ddl->ddl_object == 0) + return; + + ASSERT(avl_is_empty(&ddl->ddl_tree)); + + /* Eject the entire object */ + dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); + + ddl->ddl_length = 0; + ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; + memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) +{ + /* Swap the logs. The old flushing one must be empty */ + VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); + + /* + * If there are still blocks on the flushing log, truncate it first. + * This can happen if there were entries on the flushing log that were + * removed in memory via ddt_lookup(); their vestigal remains are + * on disk. + */ + if (ddt->ddt_log_flushing->ddl_length > 0) + ddt_log_truncate(ddt, tx); + + /* + * Swap policy. We swap the logs (and so begin flushing) when the + * active tree grows too large, or when we haven't swapped it in + * some amount of time. + */ + + /* + * The log tree is too large if the memory usage of its entries is over + * half of the memory limit. This effectively gives each log tree half + * the available memory. + */ + const boolean_t too_large = + (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * + DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); + + const boolean_t too_old = + tx->tx_txg >= + (ddt->ddt_log_active->ddl_first_txg + + MAX(1, zfs_dedup_log_txg_max)); + + if (!(too_large || too_old)) + return (B_FALSE); + + ddt_log_t *swap = ddt->ddt_log_active; + ddt->ddt_log_active = ddt->ddt_log_flushing; + ddt->ddt_log_flushing = swap; + + ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); + ddt->ddt_log_active->ddl_flags &= + ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); + + ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; + + ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; + + ddt_log_update_header(ddt, ddt->ddt_log_active, tx); + ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); + + ddt_log_update_stats(ddt); + + return (B_TRUE); +} + +static inline void +ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, + const ddt_key_t *checkpoint) +{ + ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)dlr->dlr_payload; + if (checkpoint != NULL && + ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { + /* Skip pre-checkpoint entries; they're already flushed. 
*/ + return; + } + + ddt_lightweight_entry_t ddlwe; + ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); + ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); + + ddlwe.ddlwe_key = dlre->dlre_key; + memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); + + ddt_log_update_entry(ddt, ddl, &ddlwe); +} + +static void +ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) +{ + void *cookie = NULL; + ddt_log_entry_t *ddle; + IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); + while ((ddle = + avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + } + ASSERT(avl_is_empty(&ddl->ddl_tree)); +} + +static int +ddt_log_load_one(ddt_t *ddt, uint_t n) +{ + ASSERT3U(n, <, 2); + + ddt_log_t *ddl = &ddt->ddt_log[n]; + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + uint64_t obj; + int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &obj); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + dnode_t *dn; + err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); + if (err != 0) + return (err); + + ddt_log_header_t hdr; + dmu_buf_t *db; + err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + return (err); + } + memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); + dmu_buf_rele(db, FTAG); + + if (DLH_GET_VERSION(&hdr) != 1) { + dnode_rele(dn, FTAG); + zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " + "unknown version=%llu", spa_name(ddt->ddt_spa), name, + (u_longlong_t)DLH_GET_VERSION(&hdr)); + return (SET_ERROR(EINVAL)); + } + + ddt_key_t *checkpoint = NULL; + if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { + /* + * If the log has a checkpoint, then we can ignore any entries + * that have already been flushed. + */ + ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); + checkpoint = &hdr.dlh_checkpoint; + } + + if (hdr.dlh_length > 0) { + dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, + ZIO_PRIORITY_SYNC_READ); + + for (uint64_t offset = 0; offset < hdr.dlh_length; + offset += dn->dn_datablksz) { + err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, + DMU_READ_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (err); + } + + uint64_t boffset = 0; + while (boffset < db->db_size) { + ddt_log_record_t *dlr = + (ddt_log_record_t *)(db->db_data + boffset); + + /* Partially-filled block, skip the rest */ + if (DLR_GET_TYPE(dlr) == DLR_INVALID) + break; + + switch (DLR_GET_TYPE(dlr)) { + case DLR_ENTRY: + ddt_log_load_entry(ddt, ddl, dlr, + checkpoint); + break; + + default: + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (SET_ERROR(EINVAL)); + } + + boffset += DLR_GET_RECLEN(dlr); + } + + dmu_buf_rele(db, FTAG); + } + } + + dnode_rele(dn, FTAG); + + ddl->ddl_object = obj; + ddl->ddl_flags = DLH_GET_FLAGS(&hdr); + ddl->ddl_length = hdr.dlh_length; + ddl->ddl_first_txg = hdr.dlh_first_txg; + + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + ddt->ddt_log_flushing = ddl; + else + ddt->ddt_log_active = ddl; + + return (0); +} + +int +ddt_log_load(ddt_t *ddt) +{ + int err; + + if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { + /* + * The DDT is going to be freed again in a moment, so there's + * no point loading the log; it'll just slow down import. 
+		 */
+		return (0);
+	}
+
+	ASSERT0(ddt->ddt_log[0].ddl_object);
+	ASSERT0(ddt->ddt_log[1].ddl_object);
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then the log object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if ((err = ddt_log_load_one(ddt, 0)) != 0)
+		return (err);
+	if ((err = ddt_log_load_one(ddt, 1)) != 0)
+		return (err);
+
+	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
+	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
+
+	/*
+	 * We have two finalisation tasks:
+	 *
+	 * - rebuild the histogram. We do this at the end rather than while
+	 *   we're loading so we don't need to uncount and recount entries that
+	 *   appear multiple times in the log.
+	 *
+	 * - remove entries from the flushing tree that are on both trees. This
+	 *   happens when ddt_lookup() rehydrates an entry from the flushing
+	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
+	 *   tree but doesn't remove it from disk.
+	 */
+
+	/*
+	 * We don't technically need a config lock here, since there shouldn't
+	 * be pool config changes during DDT load. dva_get_dsize_sync() via
+	 * ddt_stat_generate() is expecting it though, and it won't hurt
+	 * anything, so we take it.
+	 */
+	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
+
+	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
+	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
+	ddt_log_entry_t *ae = avl_first(al);
+	ddt_log_entry_t *fe = avl_first(fl);
+	while (ae != NULL || fe != NULL) {
+		ddt_log_entry_t *ddle;
+		if (ae == NULL) {
+			/* active exhausted, take flushing */
+			ddle = fe;
+			fe = AVL_NEXT(fl, fe);
+		} else if (fe == NULL) {
+			/* flushing exhausted, take active */
+			ddle = ae;
+			ae = AVL_NEXT(al, ae);
+		} else {
+			/* compare active and flushing */
+			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
+			if (c < 0) {
+				/* active behind, take and advance */
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			} else if (c > 0) {
+				/* flushing behind, take and advance */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+			} else {
+				/* match.
remove from flushing, take active */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + avl_remove(fl, ddle); + + ddle = ae; + ae = AVL_NEXT(al, ae); + } + } + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + + spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); + + ddt_log_update_stats(ddt); + + return (0); +} + +void +ddt_log_alloc(ddt_t *ddt) +{ + ASSERT3P(ddt->ddt_log_active, ==, NULL); + ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + + avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + ddt->ddt_log_active = &ddt->ddt_log[0]; + ddt->ddt_log_flushing = &ddt->ddt_log[1]; + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; +} + +void +ddt_log_free(ddt_t *ddt) +{ + ddt_log_empty(ddt, &ddt->ddt_log[0]); + ddt_log_empty(ddt, &ddt->ddt_log[1]); + avl_destroy(&ddt->ddt_log[0].ddl_tree); + avl_destroy(&ddt->ddt_log[1].ddl_tree); +} + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, + "Max transactions before starting to flush dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, + "Max memory for dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, + "Max memory for dedup logs, as % of total memory"); diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 9316200f2..8f55bc24f 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddo_total->ddo_mspace += ddo->ddo_mspace; } } + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } /* @@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) &ddt->ddt_histogram_cache[type][class]); } } + + ddt_histogram_add(ddh, &ddt->ddt_log_histogram); } } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 3de316a12..96943421f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special +DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 83c4d7c8e..4f6e5805b 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -29,9 +29,16 @@ log_assert "basic dedup (FDT) operations work" +# we 
set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index f0f20671b..259eaddc0 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -29,9 +29,16 @@ log_assert "dedup (FDT) retains version after import" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index 049ccaae3..114cf0266 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -30,9 +30,16 @@ log_assert "legacy and FDT dedup tables on the same pool can happily coexist" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index d563fade8..c36463134 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -30,9 +30,16 @@ log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 5b83a1ca3..326152b51 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -51,6 +51,12 @@ POOL="dedup_pool" save_tunable TXG_TIMEOUT +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. 
without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	if poolexists $POOL ; then
@@ -58,6 +64,7 @@ function cleanup
 	fi
 	log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
 	log_must restore_tunable TXG_TIMEOUT
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }
 
 
@@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit
 
 	#
 	# With no DDT quota in place, the above workload will produce over
-	# 800,000 entries by using space in the normal class. With a quota,
-	# it will be well below 500,000 entries.
+	# 800,000 entries by using space in the normal class. With a quota, it
+	# should be well under 500,000. However, logged entries are hard to
+	# account for because they can appear on both logs, and can also
+	# represent an eventual removal. This isn't easily visible from
+	# outside, and even internally can result in going slightly over quota.
+	# Here we just set the entry count a little higher than we expect, to
+	# allow for some instability.
 	#
-	log_must test $(ddt_entries) -le 500000
+	log_must test $(ddt_entries) -le 600000
 
 	do_clean
 }
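
As a side note on the swap policy in the ddt_log_swap() hunk above: the rotation decision reduces to two independent triggers, size and age, and either one is enough to rotate the logs and begin flushing. The standalone sketch below models only that check; the constants are illustrative stand-ins for zfs_dedup_log_mem_max, zfs_dedup_log_txg_max and DDT_LOG_ENTRY_SIZE(), not the module defaults.

/*
 * Minimal sketch of the "too large or too old" swap decision, under
 * assumed values for the tunables and per-entry footprint.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_MEM_MAX	(64ULL << 20)	/* assumed cap on in-core log memory */
#define LOG_TXG_MAX	8		/* assumed txg limit before a swap */
#define LOG_ENTRY_SIZE	512ULL		/* assumed per-entry footprint */

static bool
should_swap(uint64_t active_entries, uint64_t active_first_txg,
    uint64_t cur_txg)
{
	/* "Too large": the active tree uses over half the memory cap. */
	bool too_large =
	    (active_entries * LOG_ENTRY_SIZE) >= (LOG_MEM_MAX >> 1);

	/* "Too old": it has been accumulating for too many txgs. */
	bool too_old = cur_txg >= active_first_txg + LOG_TXG_MAX;

	return (too_large || too_old);
}

int
main(void)
{
	/* Small and recently swapped: keep appending (prints 0). */
	printf("%d\n", should_swap(1000, 100, 103));
	/* Same size but 8 txgs old: rotate and start flushing (prints 1). */
	printf("%d\n", should_swap(1000, 100, 108));
	/* Over half the memory cap: rotate regardless of age (prints 1). */
	printf("%d\n", should_swap(70000, 100, 101));
	return (0);
}

A large active tree rotates immediately, while a small but stale one rotates once the assumed txg limit has elapsed since ddl_first_txg, mirroring the too_large/too_old test in the hunk above.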