about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Don Brady <[email protected]> 2024-06-17 22:35:18 +0000
committer Brian Behlendorf <[email protected]> 2024-09-04 14:17:02 -0700
commit d4d79451cb87aa0d93f9068ce5844098a5ebe3b5 (patch)
tree 7d9db775172947aaaefce2869dcb5871b3f5986d
parent 4a4f7b019fa57e2a196e95492aecbed1f312be3a (diff)
Add DDT prune command
Requires the new 'flat' physical data which has the start time for a class entry. The amount to prune can be based on a target percentage of the unique entries or based on the age (i.e., every entry older than N days). Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Don Brady <[email protected]> Closes #16277
-rw-r--r-- cmd/zdb/zdb.c 55
-rw-r--r-- cmd/zpool/zpool_main.c 89
-rw-r--r-- cmd/ztest.c 28
-rw-r--r-- contrib/debian/openzfs-zfsutils.install 1
-rw-r--r-- include/libzfs.h 3
-rw-r--r-- include/libzfs_core.h 3
-rw-r--r-- include/sys/ddt.h 3
-rw-r--r-- include/sys/ddt_impl.h 52
-rw-r--r-- include/sys/fs/zfs.h 15
-rw-r--r-- include/sys/spa_impl.h 1
-rw-r--r-- lib/libzfs/libzfs.abi 67
-rw-r--r-- lib/libzfs/libzfs_pool.c 28
-rw-r--r-- lib/libzfs_core/libzfs_core.abi 15
-rw-r--r-- lib/libzfs_core/libzfs_core.c 22
-rw-r--r-- man/Makefile.am 1
-rw-r--r-- man/man8/zpool-ddtprune.8 48
-rw-r--r-- man/man8/zpool.8 1
-rw-r--r-- module/zfs/ddt.c 474
-rw-r--r-- module/zfs/ddt_log.c 24
-rw-r--r-- module/zfs/zfs_ioctl.c 50
-rw-r--r-- module/zfs/zio.c 10
21 files changed, 905 insertions, 85 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 41c2b6765..8e3b6972a 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
- if (!ddt)
+ if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
@@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
}
dump_dedup_ratio(&dds_total);
+
+ /*
+ * Dump a histogram of unique class entry age
+ */
+ if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
+ ddt_age_histo_t histogram;
+
+ (void) printf("DDT walk unique, building age histogram...\n");
+ ddt_prune_walk(spa, 0, &histogram);
+
+ /*
+ * print out histogram for unique entry class birth
+ */
+ if (histogram.dah_entries > 0) {
+ (void) printf("%5s %9s %4s\n",
+ "age", "blocks", "amnt");
+ (void) printf("%5s %9s %4s\n",
+ "-----", "---------", "----");
+ for (int i = 0; i < HIST_BINS; i++) {
+ (void) printf("%5d %9d %4d%%\n", 1 << i,
+ (int)histogram.dah_age_histo[i],
+ (int)((histogram.dah_age_histo[i] * 100) /
+ histogram.dah_entries));
+ }
+ }
+ }
}
static void
@@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
ddt_entry_t *dde = ddt_lookup(ddt, bp);
/*
- * ddt_lookup() can only return NULL if this block didn't exist
+ * ddt_lookup() can return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
- * the DDT, so this can't happen.
+ * the DDT, so this can't happen. However, when unique entries
+ * are pruned, the dedup bit can be set with no corresponding
+ * entry in the DDT.
*/
- VERIFY3P(dde, !=, NULL);
+ if (dde == NULL) {
+ ddt_exit(ddt);
+ goto skipped;
+ }
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
@@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
- VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
- ddt_phys_decref(dde->dde_phys, v);
+ if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
+ ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
@@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
}
+skipped:
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa)
for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
- if (!ddt)
+ if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
/* DDT store objects */
@@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa)
}
/* FDT container */
- mos_obj_refd(ddt->ddt_dir_object);
+ if (ddt->ddt_version == DDT_VERSION_FDT)
+ mos_obj_refd(ddt->ddt_dir_object);
/* FDT log objects */
- mos_obj_refd(ddt->ddt_log[0].ddl_object);
- mos_obj_refd(ddt->ddt_log[1].ddl_object);
+ if (ddt->ddt_flags & DDT_FLAG_LOG) {
+ mos_obj_refd(ddt->ddt_log[0].ddl_object);
+ mos_obj_refd(ddt->ddt_log[1].ddl_object);
+ }
}
if (spa->spa_brt != NULL) {
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9cd26a865..ce859226c 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);
static int zpool_do_wait(int, char **);
+static int zpool_do_ddt_prune(int, char **);
+
static int zpool_do_help(int argc, char **argv);
static zpool_compat_status_t zpool_do_load_compat(
@@ -170,6 +172,7 @@ typedef enum {
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
+ HELP_DDT_PRUNE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@@ -426,6 +429,8 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
{ NULL },
{ "wait", zpool_do_wait, HELP_WAIT },
+ { NULL },
+ { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
};
#define NCOMMAND (ARRAY_SIZE(command_table))
@@ -545,6 +550,8 @@ get_usage(zpool_help_t idx)
case HELP_WAIT:
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
"<pool> [interval]\n"));
+ case HELP_DDT_PRUNE:
+ return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
default:
__builtin_unreachable();
}
@@ -13342,6 +13349,88 @@ found:;
return (error);
}
+/*
+ * zpool ddtprune -d|-p <amount> <pool>
+ *
+ *	-d <days>	Prune entries <days> old and older
+ *	-p <percent>	Prune <percent> amount of entries
+ *
+ * Prune single reference entries from DDT to satisfy the amount specified.
+ * Returns 0 on success, -1 if the pool cannot be opened or the prune fails.
+ */
+int
+zpool_do_ddt_prune(int argc, char **argv)
+{
+	zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
+	uint64_t amount = 0;
+	zpool_handle_t *zhp;
+	char *endptr;
+	int c;
+
+	while ((c = getopt(argc, argv, "d:p:")) != -1) {
+		switch (c) {
+		case 'd':
+			if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
+				(void) fprintf(stderr, gettext("-d cannot be "
+				    "combined with -p option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' || amount == 0 ||
+			    amount > UINT64_MAX / 86400) { /* would wrap */
+				(void) fprintf(stderr,
+				    gettext("invalid days value\n"));
+				usage(B_FALSE);
+			}
+			amount *= 86400;	/* convert days to seconds */
+			unit = ZPOOL_DDT_PRUNE_AGE;
+			break;
+		case 'p':
+			if (unit == ZPOOL_DDT_PRUNE_AGE) {
+				(void) fprintf(stderr, gettext("-p cannot be "
+				    "combined with -d option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' ||
+			    amount == 0 || amount > 100) {
+				(void) fprintf(stderr,
+				    gettext("invalid percentage value\n"));
+				usage(B_FALSE);
+			}
+			unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (unit == ZPOOL_DDT_PRUNE_NONE) {
+		(void) fprintf(stderr,
+		    gettext("missing amount option (-d|-p <value>)\n"));
+		usage(B_FALSE);
+	} else if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	} else if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (-1);
+
+	int error = zpool_ddt_prune(zhp, unit, amount);
+	zpool_close(zhp);
+
+	return (error);
+}
+
+
static int
find_command_idx(const char *command, int *idx)
{
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 7c9db84d4..a7843d338 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
+extern boolean_t ddt_prune_artificial_age;
+extern boolean_t ddt_dump_prune_histogram;
static ztest_shared_opts_t *ztest_shared_opts;
@@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
+ztest_func_t ztest_ddt_prune;
static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
+ ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
+/* ztest op: prune a random percentage of unique DDT entries; result ignored */
+void
+ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
+{
+	(void) zd, (void) id;
+	const uint64_t percent = 1 + ztest_random(15);
+
+	(void) ddt_prune_unique_entries(ztest_spa,
+	    ZPOOL_DDT_PRUNE_PERCENTAGE, percent);
+}
+
/*
* Verify pool integrity by running zdb.
*/
@@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg)
{
spa_t *spa = arg;
+ /*
+ * Synthesize aged DDT entries for ddt prune testing
+ */
+ ddt_prune_artificial_age = B_TRUE;
+ if (ztest_opts.zo_verbose >= 3)
+ ddt_dump_prune_histogram = B_TRUE;
+
while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
@@ -8587,6 +8609,12 @@ ztest_init(ztest_shared_t *zs)
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;
+ /*
+ * split 50/50 between legacy and fast dedup
+ */
+ if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
+ continue;
+
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
fnvlist_add_uint64(props, buf, 0);
diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install
index 10083351a..d51e4ef00 100644
--- a/contrib/debian/openzfs-zfsutils.install
+++ b/contrib/debian/openzfs-zfsutils.install
@@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
usr/share/man/man8/zpool-create.8
usr/share/man/man8/zpool-destroy.8
usr/share/man/man8/zpool-detach.8
+usr/share/man/man8/zpool-ddtprune.8
usr/share/man/man8/zpool-events.8
usr/share/man/man8/zpool-export.8
usr/share/man/man8/zpool-get.8
diff --git a/include/libzfs.h b/include/libzfs.h
index 241279754..01d51999f 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
+_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
+ uint64_t);
+
_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 206e5e5c2..b1d74fbbc 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
+_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
+ uint64_t);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 93abad85a..4e5ccd463 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
+extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
+ uint64_t amount);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index 6f11cd90c..4d3c0cae0 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -35,8 +35,11 @@ extern "C" {
#endif
/* DDT version numbers */
-#define DDT_VERSION_LEGACY (0)
-#define DDT_VERSION_FDT (1)
+#define DDT_VERSION_LEGACY (0)
+#define DDT_VERSION_FDT (1)
+
+/* Dummy version to signal that configure is still necessary */
+#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
@@ -187,8 +190,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
-extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
- const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
+
+extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
+ ddt_lightweight_entry_t *ddlwe);
+extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
+ const ddt_key_t *ddk);
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
@@ -212,6 +218,44 @@ extern void ddt_log_fini(void);
*/
/*
+ * We use a histogram to convert a percentage request into a
+ * cutoff value where entries older than the cutoff get pruned.
+ *
+ * The histogram bins represent hours in power-of-two increments.
+ * 16 bins covers up to four years.
+ */
+#define HIST_BINS 16
+
+typedef struct ddt_age_histo {
+ uint64_t dah_entries; /* total entries; divisor for per-bin percentages */
+ uint64_t dah_age_histo[HIST_BINS]; /* entry count per age bin */
+} ddt_age_histo_t;
+
+void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
+
+#if defined(_KERNEL) || !defined(ZFS_DEBUG)
+#define ddt_dump_age_histogram(histo, cutoff) ((void)0)
+#else
+static inline void
+ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
+{
+	const uint64_t total = histogram->dah_entries;
+	if (total == 0)		/* nothing counted; also avoids dividing by 0 */
+		return;
+
+	(void) printf("DDT prune unique class age, %llu hour cutoff\n",
+	    (u_longlong_t)(gethrestime_sec() - cutoff)/3600);
+	(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
+	(void) printf("%5s %9s %4s\n", "-----", "---------", "----");
+	for (int bin = 0; bin < HIST_BINS; bin++) {
+		const uint64_t count = histogram->dah_age_histo[bin];
+		/* one row per bin: age label 1<<bin, count, share of total */
+		(void) printf("%5d %9llu %4d%%\n", 1<<bin,
+		    (u_longlong_t)count, (int)((count * 100) / total));
+	}
+}
+#endif
+
+/*
* Enough room to expand DMU_POOL_DDT format for all possible DDT
* checksum/class/type combinations.
*/
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 73d686a00..fc4f22cd5 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1422,7 +1422,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
- * Core features - 88/128 numbers reserved.
+ * Core features - 89/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
@@ -1519,6 +1519,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */
ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */
+ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */
/*
* Per-platform (Optional) - 8/128 numbers reserved.
@@ -1655,6 +1656,12 @@ typedef enum {
ZPOOL_PREFETCH_DDT
} zpool_prefetch_type_t;
+typedef enum {
+ ZPOOL_DDT_PRUNE_NONE, /* no unit selected */
+ ZPOOL_DDT_PRUNE_AGE, /* in seconds */
+ ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
+} zpool_ddt_prune_unit_t;
+
/*
* Bookmark name values.
*/
@@ -1754,6 +1761,12 @@ typedef enum {
#define ZPOOL_PREFETCH_TYPE "prefetch_type"
/*
+ * The following are names used when invoking ZFS_IOC_DDT_PRUNE.
+ */
+#define DDT_PRUNE_UNIT "ddt_prune_unit"
+#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 4fc6f22fc..7811abbb9 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -412,6 +412,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
+ boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 87c5c4380..88dd8b3c6 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -183,8 +183,8 @@
<elf-symbol name='fsleep' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_dataset_depth' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_system_hostid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
- <elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_timestamp' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getextmntent' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getmntany' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getprop_uint64' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -466,7 +466,9 @@
<elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_default_search_paths' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_disable_datasets' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -485,8 +487,8 @@
<elf-symbol name='zpool_export_force' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_feature_init' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_config' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
- <elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_parent_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev_by_physpath' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_free_handles' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_get_all_vdev_props' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -529,7 +531,6 @@
<elf-symbol name='zpool_prefetch' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
- <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_default_numeric' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -5929,6 +5930,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
+ <enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@@ -5963,6 +5965,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
+ <enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
+ <underlying-type type-id='9cac1fee'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
+ </enum-decl>
+ <typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='spa_feature' id='33ecb627'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='SPA_FEATURE_NONE' value='-1'/>
@@ -6139,6 +6148,12 @@
<parameter type-id='857bb57e'/>
<return type-id='95e97e5e'/>
</function-decl>
+ <function-decl name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64'>
+ <parameter type-id='80f4b756'/>
+ <parameter type-id='02e25ab0'/>
+ <parameter type-id='9c313c2d'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-decl name='zfs_resolve_shortname' mangled-name='zfs_resolve_shortname' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_resolve_shortname'>
<parameter type-id='80f4b756'/>
<parameter type-id='26a90f95'/>
@@ -6798,6 +6813,12 @@
<parameter type-id='80f4b756' name='propval'/>
<return type-id='95e97e5e'/>
</function-decl>
+ <function-decl name='zpool_ddt_prune' mangled-name='zpool_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_ddt_prune'>
+ <parameter type-id='4c81de99' name='zhp'/>
+ <parameter type-id='02e25ab0' name='unit'/>
+ <parameter type-id='9c313c2d' name='amount'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libzfs/libzfs_sendrecv.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='8901473c' size-in-bits='576' id='f5da478b'>
@@ -7837,7 +7858,7 @@
</data-member>
</class-decl>
<typedef-decl name='vdev_cbdata_t' type-id='b8006be8' id='a9679c94'/>
- <class-decl name='zprop_get_cbdata' size-in-bits='832' is-struct='yes' visibility='default' id='f3d3c319'>
+ <class-decl name='zprop_get_cbdata' size-in-bits='960' is-struct='yes' visibility='default' id='f3d3c319'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='cb_sources' type-id='95e97e5e' visibility='default'/>
</data-member>
@@ -7856,6 +7877,9 @@
<data-member access='public' layout-offset-in-bits='448'>
<var-decl name='cb_first' type-id='c19b74c3' visibility='default'/>
</data-member>
+ <data-member access='public' layout-offset-in-bits='480'>
+ <var-decl name='cb_json' type-id='c19b74c3' visibility='default'/>
+ </data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='cb_proplist' type-id='3a9b2288' visibility='default'/>
</data-member>
@@ -7865,6 +7889,15 @@
<data-member access='public' layout-offset-in-bits='640'>
<var-decl name='cb_vdevs' type-id='a9679c94' visibility='default'/>
</data-member>
+ <data-member access='public' layout-offset-in-bits='832'>
+ <var-decl name='cb_jsobj' type-id='5ce45b60' visibility='default'/>
+ </data-member>
+ <data-member access='public' layout-offset-in-bits='896'>
+ <var-decl name='cb_json_as_int' type-id='c19b74c3' visibility='default'/>
+ </data-member>
+ <data-member access='public' layout-offset-in-bits='928'>
+ <var-decl name='cb_json_pool_key_guid' type-id='c19b74c3' visibility='default'/>
+ </data-member>
</class-decl>
<typedef-decl name='zprop_get_cbdata_t' type-id='f3d3c319' id='f3d87113'/>
<typedef-decl name='zprop_func' type-id='2e711a2a' id='1ec3747a'/>
@@ -7968,6 +8001,11 @@
<qualified-type-def type-id='d33f11cb' restrict='yes' id='5c53ba29'/>
<pointer-type-def type-id='ffa52b96' size-in-bits='64' id='76c8174b'/>
<pointer-type-def type-id='f3d87113' size-in-bits='64' id='0d2a0670'/>
+ <function-decl name='nvlist_print_json' visibility='default' binding='global' size-in-bits='64'>
+ <parameter type-id='822cd80b'/>
+ <parameter type-id='5ce45b60'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-decl name='zpool_label_disk' mangled-name='zpool_label_disk' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_label_disk'>
<parameter type-id='b0382bb3'/>
<parameter type-id='4c81de99'/>
@@ -8075,6 +8113,11 @@
<parameter type-id='d33f11cb'/>
<return type-id='48b5725f'/>
</function-decl>
+ <function-decl name='putc' visibility='default' binding='global' size-in-bits='64'>
+ <parameter type-id='95e97e5e'/>
+ <parameter type-id='822cd80b'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-decl name='puts' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
@@ -8093,6 +8136,11 @@
<parameter type-id='95e97e5e'/>
<return type-id='48b5725f'/>
</function-decl>
+ <function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
+ <parameter type-id='80f4b756'/>
+ <parameter type-id='80f4b756'/>
+ <return type-id='b59d7dce'/>
+ </function-decl>
<function-decl name='strnlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='b59d7dce'/>
@@ -8292,12 +8340,12 @@
<function-decl name='zfs_version_print' mangled-name='zfs_version_print' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_print'>
<return type-id='95e97e5e'/>
</function-decl>
- <function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
- <return type-id='95e97e5e'/>
- </function-decl>
<function-decl name='zfs_version_nvlist' mangled-name='zfs_version_nvlist' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_nvlist'>
<return type-id='5ce45b60'/>
</function-decl>
+ <function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-decl name='printf_color' mangled-name='printf_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='printf_color'>
<parameter type-id='80f4b756' name='color'/>
<parameter type-id='80f4b756' name='format'/>
@@ -8802,11 +8850,6 @@
<parameter type-id='78c01427'/>
<return type-id='13956559'/>
</function-decl>
- <function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
- <parameter type-id='80f4b756'/>
- <parameter type-id='80f4b756'/>
- <return type-id='b59d7dce'/>
- </function-decl>
<function-decl name='zfs_dirnamelen' mangled-name='zfs_dirnamelen' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_dirnamelen'>
<parameter type-id='80f4b756' name='path'/>
<return type-id='79a0948f'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index dfa7c4db6..14410b153 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5649,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
return (ret);
}
+
+/*
+ * Prune older entries from the DDT to reclaim space under the quota.
+ * Returns 0 on success; on failure reports the error and returns -1.
+ */
+int
+zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
+    uint64_t amount)
+{
+	int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);
+	if (error == 0)
+		return (0);
+
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	char errbuf[ERRBUFLEN];
+
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "cannot prune dedup table on '%s'"), zhp->zpool_name);
+	if (error == EALREADY) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "a prune operation is already in progress"));
+		(void) zfs_error(hdl, EZFS_BUSY, errbuf);
+	} else {
+		/* Not errno: the calls above may have overwritten it. */
+		(void) zpool_standard_error(hdl, error, errbuf);
+	}
+	return (-1);
+}
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 1062a6b52..5ee6b8e09 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -162,6 +162,7 @@
<elf-symbol name='lzc_channel_program_nosync' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_clone' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='lzc_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_bookmarks' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_snaps' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1444,6 +1445,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
+ <enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@@ -1484,6 +1486,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
+ <enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
+ <underlying-type type-id='9cac1fee'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
+ <enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
+ </enum-decl>
+ <typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='data_type_t' naming-typedef-id='8d0687d2' id='aeeae136'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='DATA_TYPE_DONTCARE' value='-1'/>
@@ -3015,6 +3024,12 @@
<parameter type-id='857bb57e' name='outnvl'/>
<return type-id='95e97e5e'/>
</function-decl>
+ <function-decl name='lzc_ddt_prune' mangled-name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='lzc_ddt_prune'>
+ <parameter type-id='80f4b756' name='pool'/>
+ <parameter type-id='02e25ab0' name='unit'/>
+ <parameter type-id='9c313c2d' name='amount'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-type size-in-bits='64' id='c70fa2e8'>
<parameter type-id='95e97e5e'/>
<parameter type-id='eaa32e2f'/>
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index ec8b0ff4f..d07fca6ce 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
{
return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
}
+
+/*
+ * Ask the kernel to prune entries from the dedup table of the named pool.
+ *
+ * 'unit' and 'amount' are packed into an nvlist under the DDT_PRUNE_UNIT
+ * (int32) and DDT_PRUNE_AMOUNT (uint64) keys and passed to the
+ * ZFS_IOC_DDT_PRUNE ioctl.  The return value is whatever lzc_ioctl()
+ * returned; any result nvlist is discarded.
+ */
+int
+lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
+{
+	nvlist_t *innvl = fnvlist_alloc();
+	nvlist_t *outnvl = NULL;
+	int err;
+
+	fnvlist_add_int32(innvl, DDT_PRUNE_UNIT, unit);
+	fnvlist_add_uint64(innvl, DDT_PRUNE_AMOUNT, amount);
+
+	err = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, innvl, &outnvl);
+
+	/* Neither list is needed once the ioctl has returned */
+	fnvlist_free(outnvl);
+	fnvlist_free(innvl);
+
+	return (err);
+}
diff --git a/man/Makefile.am b/man/Makefile.am
index 194bb4721..fde704933 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -72,6 +72,7 @@ dist_man_MANS = \
%D%/man8/zpool-create.8 \
%D%/man8/zpool-destroy.8 \
%D%/man8/zpool-detach.8 \
+ %D%/man8/zpool-ddtprune.8 \
%D%/man8/zpool-events.8 \
%D%/man8/zpool-export.8 \
%D%/man8/zpool-get.8 \
diff --git a/man/man8/zpool-ddtprune.8 b/man/man8/zpool-ddtprune.8
new file mode 100644
index 000000000..1ab7d3982
--- /dev/null
+++ b/man/man8/zpool-ddtprune.8
@@ -0,0 +1,48 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2024, Klara Inc.
+.\"
+.Dd June 17, 2024
+.Dt ZPOOL-DDTPRUNE 8
+.Os
+.
+.Sh NAME
+.Nm zpool-ddtprune
+.Nd Prunes the oldest entries from the single reference dedup table(s)
+.Sh SYNOPSIS
+.Nm zpool
+.Cm ddtprune
+.Fl d Ar days | Fl p Ar percentage
+.Ar pool
+.Sh DESCRIPTION
+This command prunes older unique entries from the dedup table.
+As a complement to the dedup quota feature,
+.Sy ddtprune
+allows removal of older non-duplicate entries to make room for
+newer duplicate entries.
+.Pp
+The amount to prune can be based on a target percentage of the unique entries
+or based on the age (i.e., every unique entry older than N days).
+.
+.Sh SEE ALSO
+.Xr zdb 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index c55644d9e..02a258f66 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -592,6 +592,7 @@ don't wait.
.Xr zpool-checkpoint 8 ,
.Xr zpool-clear 8 ,
.Xr zpool-create 8 ,
+.Xr zpool-ddtprune 8 ,
.Xr zpool-destroy 8 ,
.Xr zpool-detach 8 ,
.Xr zpool-events 8 ,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 11fd10fb7..0e12e7e49 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -125,6 +125,13 @@
* without which, no space would be recovered and the DDT would continue to be
* considered "over quota". See zap_shrink_enabled.
*
+ * ## Dedup table pruning
+ *
+ * As a complement to the dedup quota feature, ddtprune allows removal of older
+ * non-duplicate entries to make room for newer duplicate entries. The amount
+ * to prune can be based on a target percentage of the unique entries or based
+ * on the age (i.e., prune every unique entry older than N days).
+ *
* ## Dedup log
*
* Historically, all entries modified on a txg were written back to dedup
@@ -228,6 +235,19 @@ int zfs_dedup_prefetch = 0;
*/
uint_t dedup_class_wait_txgs = 5;
+/*
+ * How many DDT prune entries to add to the DDT sync AVL tree.
+ * Note these additional entries have a memory footprint of a
+ * ddt_entry_t (216 bytes).
+ *
+ * ddt_prune_walk() uses this as the batch size: once this many candidates
+ * have been collected they are handed to a sync task before more are queued.
+ */
+static uint32_t zfs_ddt_prunes_per_txg = 50000;
+
+/*
+ * For testing, synthesize aged DDT entries
+ * (in global scope for ztest)
+ *
+ * When set, ddt_class_start() backdates the class start time it returns
+ * by a random amount instead of returning the current time.
+ */
+boolean_t ddt_prune_artificial_age = B_FALSE;
+/*
+ * When set, ddt_prune_unique_entries() passes the age histogram it built
+ * for a percentage-based prune to ddt_dump_age_histogram().
+ */
+boolean_t ddt_dump_prune_histogram = B_FALSE;
/*
* Don't do more than this many incremental flush passes per txg.
@@ -268,10 +288,6 @@ static const uint64_t ddt_version_flags[] = {
[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
};
-/* Dummy version to signal that configure is still necessary */
-#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
-
-#ifdef _KERNEL
/* per-DDT kstats */
typedef struct {
/* total lookups and whether they returned new or existing entries */
@@ -324,6 +340,7 @@ static const ddt_kstats_t ddt_kstats_template = {
{ "log_flush_time_rate", KSTAT_DATA_UINT32 },
};
+#ifdef _KERNEL
#define _DDT_KSTAT_STAT(ddt, stat) \
&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
#define DDT_KSTAT_BUMP(ddt, stat) \
@@ -343,6 +360,7 @@ static const ddt_kstats_t ddt_kstats_template = {
#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
#endif /* _KERNEL */
+
static void
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
@@ -715,6 +733,30 @@ ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
}
+/*
+ * Return the timestamp (seconds, from gethrestime_sec()) to record as the
+ * class start time of a new DDT entry.
+ *
+ * When ddt_prune_artificial_age is set, the timestamp is pushed into the
+ * past by a random number of seconds so that prune can be exercised
+ * without waiting for a DDT to age naturally.
+ */
+static uint64_t
+ddt_class_start(void)
+{
+	uint64_t start = gethrestime_sec();
+
+	if (!ddt_prune_artificial_age)
+		return (start);
+
+	/*
+	 * Debug aid -- simulate a wider distribution of ages.  The window
+	 * the random offset is drawn from depends on a first random draw:
+	 * below 50 it is 1/16 of the full window, above 75 it is half of
+	 * it, and otherwise the full 2^21 second window is used.
+	 */
+	int window = 1 << 21;
+	int draw = random_in_range(100);
+
+	if (draw < 50)
+		window >>= 4;
+	else if (draw > 75)
+		window >>= 1;
+
+	return (start - random_in_range(window));
+}
+
void
ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
@@ -1022,6 +1064,47 @@ ddt_prefetch_all(spa_t *spa)
static int ddt_configure(ddt_t *ddt, boolean_t new);
+/*
+ * Decide whether a DDT entry found by key is really the entry the caller's
+ * BP refers to.
+ *
+ * A BP that carries DVAs must have the same DVAs as the entry.  If they
+ * differ, the BP belongs to a previous generation of this entry (i.e. one
+ * that was pruned earlier) and the caller has to behave as if the entry
+ * did not exist.  A BP without DVAs matches any entry.
+ *
+ * This should only happen during a lookup to free the block
+ * (zio_ddt_free()).
+ *
+ * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
+ *     -- robn, 2024-02-09
+ */
+static boolean_t
+ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
+{
+	const uint_t ncopies = BP_GET_NDVAS(bp);
+	const dva_t *entry_dvas;
+
+	/* Nothing to compare against, so the entry is good */
+	if (ncopies == 0)
+		return (B_TRUE);
+
+	/*
+	 * Pick the phys to compare with.  A flat entry has exactly one;
+	 * a traditional entry is indexed by the number of copies.
+	 */
+	if (ddt->ddt_flags & DDT_FLAG_FLAT)
+		entry_dvas = dde->dde_phys->ddp_flat.ddp_dva;
+	else
+		entry_dvas = dde->dde_phys->ddp_trad[ncopies].ddp_dva;
+
+	/*
+	 * Every DVA in the BP has to match.  A partial match is treated as
+	 * a mismatch; there is nothing better to do with it here, it is an
+	 * error from somewhere else, maybe long ago.
+	 */
+	uint_t i = 0;
+	while (i < ncopies) {
+		if (!DVA_EQUAL(&entry_dvas[i], &bp->blk_dva[i]))
+			return (B_FALSE);
+		i++;
+	}
+	ASSERT3U(i, ==, ncopies);
+
+	return (B_TRUE);
+}
+
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
{
@@ -1057,8 +1140,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* If it's already loaded, we can just return it. */
DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
- if (dde->dde_flags & DDE_FLAG_LOADED)
- return (dde);
+ if (dde->dde_flags & DDE_FLAG_LOADED) {
+ if (ddt_entry_lookup_is_valid(ddt, bp, dde))
+ return (dde);
+ return (NULL);
+ }
/* Someone else is loading it, wait for it. */
dde->dde_waiters++;
@@ -1077,7 +1163,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
}
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
- return (dde);
+
+ /* Make sure the loaded entry matches the BP */
+ if (ddt_entry_lookup_is_valid(ddt, bp, dde))
+ return (dde);
+ return (NULL);
} else
DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
@@ -1086,32 +1176,42 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* Record the time this class was created (used by ddt prune) */
if (ddt->ddt_flags & DDT_FLAG_FLAT)
- dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
+ dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
avl_insert(&ddt->ddt_tree, dde, where);
/* If its in the log tree, we can "load" it from there */
if (ddt->ddt_flags & DDT_FLAG_LOG) {
ddt_lightweight_entry_t ddlwe;
- boolean_t found = B_FALSE;
-
- if (ddt_log_take_key(ddt, ddt->ddt_log_active,
- &search, &ddlwe)) {
- DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
- found = B_TRUE;
- } else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing,
- &search, &ddlwe)) {
- DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit);
- found = B_TRUE;
- }
-
- if (found) {
- dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
+ if (ddt_log_find_key(ddt, &search, &ddlwe)) {
+ /*
+ * See if we have the key first, and if so, set up
+ * the entry.
+ */
dde->dde_type = ddlwe.ddlwe_type;
dde->dde_class = ddlwe.ddlwe_class;
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
DDT_PHYS_SIZE(ddt));
+ /* Whatever we found isn't valid for this BP, eject */
+ if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(ddt, dde);
+ return (NULL);
+ }
+
+ /* Remove it and count it */
+ if (ddt_log_remove_key(ddt,
+ ddt->ddt_log_active, &search)) {
+ DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
+ } else {
+ VERIFY(ddt_log_remove_key(ddt,
+ ddt->ddt_log_flushing, &search));
+ DDT_KSTAT_BUMP(ddt,
+ dds_lookup_log_flushing_hit);
+ }
+
+ dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
@@ -1150,6 +1250,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ boolean_t valid = B_TRUE;
+
if (dde->dde_type == DDT_TYPES &&
dde->dde_class == DDT_CLASSES &&
ddt_over_quota(spa)) {
@@ -1163,6 +1265,24 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* Flag cleanup required */
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
} else if (error == 0) {
+ /*
+ * If what we loaded is no good for this BP and there's no one
+ * waiting for it, we can just remove it and get out. If its no
+ * good but there are waiters, we have to leave it, because we
+ * don't know what they want. If its not needed we'll end up
+ * taking an entry log/sync, but it can only happen if more
+ * than one previous version of this block is being deleted at
+ * the same time. This is extremely unlikely to happen and not
+ * worth the effort to deal with without taking an entry
+ * update.
+ */
+ valid = ddt_entry_lookup_is_valid(ddt, bp, dde);
+ if (!valid && dde->dde_waiters == 0) {
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(ddt, dde);
+ return (NULL);
+ }
+
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
@@ -1191,7 +1311,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv);
- return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
+ if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
+ return (NULL);
+
+ return (dde);
}
void
@@ -1420,7 +1543,6 @@ not_found:
static void
ddt_table_alloc_kstats(ddt_t *ddt)
{
-#ifdef _KERNEL
char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
char *name = kmem_asprintf("ddt_stats_%s",
zio_checksum_table[ddt->ddt_checksum].ci_name);
@@ -1436,9 +1558,6 @@ ddt_table_alloc_kstats(ddt_t *ddt)
kmem_strfree(name);
kmem_strfree(mod);
-#else
- (void) ddt;
-#endif /* _KERNEL */
}
static ddt_t *
@@ -1468,13 +1587,11 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
static void
ddt_table_free(ddt_t *ddt)
{
-#ifdef _KERNEL
if (ddt->ddt_ksp != NULL) {
kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
ddt->ddt_ksp->ks_data = NULL;
kstat_delete(ddt->ddt_ksp);
}
-#endif /* _KERNEL */
ddt_log_free(ddt);
ASSERT0(avl_numnodes(&ddt->ddt_tree));
@@ -1814,7 +1931,7 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
if (ddt_phys_birth(ddp, v) == 0) {
- ASSERT3U(phys_refcnt, ==, 0);
+ ASSERT0(phys_refcnt);
continue;
}
if (DDT_PHYS_IS_DITTO(ddt, p)) {
@@ -2288,8 +2405,9 @@ ddt_walk_ready(spa_t *spa)
return (B_TRUE);
}
-int
-ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
+static int
+ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
+ uint64_t flags, boolean_t wait)
{
do {
do {
@@ -2298,7 +2416,11 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
if (ddt == NULL)
continue;
- if (ddt->ddt_flush_force_txg > 0)
+ if (flags != 0 &&
+ (ddt->ddt_flags & flags) != flags)
+ continue;
+
+ if (wait && ddt->ddt_flush_force_txg > 0)
return (EAGAIN);
int error = ENOENT;
@@ -2322,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
return (SET_ERROR(ENOENT));
}
+/*
+ * Public DDT walker.  Calls ddt_walk_impl() with no ddt_flags filter (so
+ * tables are not skipped based on their flags) and with waiting enabled,
+ * which makes the walk return EAGAIN while a table has a forced flush
+ * pending (ddt_flush_force_txg > 0).
+ */
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
+{
+	const uint64_t no_flag_filter = 0;
+	const boolean_t wait_for_flush = B_TRUE;
+
+	return (ddt_walk_impl(spa, ddb, ddlwe, no_flag_filter,
+	    wait_for_flush));
+}
+
/*
* This function is used by Block Cloning (brt.c) to increase reference
* counter for the DDT entry if the block is already in DDT.
*
* Return false if the block, despite having the D bit set, is not present
- * in the DDT. Currently this is not possible but might be in the future.
- * See the comment below.
+ * in the DDT. This is possible when the DDT has been pruned by an admin
+ * or by the DDT quota mechanism.
*/
boolean_t
ddt_addref(spa_t *spa, const blkptr_t *bp)
@@ -2359,28 +2487,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
- /*
- * This entry already existed (dde_type is real), so it must
- * have refcnt >0 at the start of this txg. We are called from
- * brt_pending_apply(), before frees are issued, so the refcnt
- * can't be lowered yet. Therefore, it must be >0. We assert
- * this because if the order of BRT and DDT interactions were
- * ever to change and the refcnt was ever zero here, then
- * likely further action is required to fill out the DDT entry,
- * and this is a place that is likely to be missed in testing.
- */
- ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
-
ddt_phys_addref(dde->dde_phys, v);
result = B_TRUE;
} else {
/*
- * At the time of implementating this if the block has the
- * DEDUP flag set it must exist in the DEDUP table, but
- * there are many advocates that want ability to remove
- * entries from DDT with refcnt=1. If this will happen,
- * we may have a block with the DEDUP set, but which doesn't
- * have a corresponding entry in the DDT. Be ready.
+ * If the block has the DEDUP flag set it still might not
+ * exist in the DEDUP table due to DDT pruning of entries
+ * where refcnt=1.
*/
ddt_remove(ddt, dde);
result = B_FALSE;
@@ -2392,6 +2505,261 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
return (result);
}
+/*
+ * One prune candidate.  Allocated by ddt_prune_entry() with room for a
+ * trailing flat phys (DDT_FLAT_PHYS_SIZE bytes) and consumed by
+ * prune_candidates_sync().
+ */
+typedef struct ddt_prune_entry {
+ /* table the candidate was found in */
+ ddt_t *dpe_ddt;
+ /* key of the candidate entry */
+ ddt_key_t dpe_key;
+ /* linkage on ddt_prune_info_t.dpi_candidates */
+ list_node_t dpe_node;
+ /* copy of the entry's flat phys, used to rebuild a BP for lookup */
+ ddt_univ_phys_t dpe_phys[];
+} ddt_prune_entry_t;
+
+/*
+ * State shared between ddt_prune_walk() and the prune_candidates_sync()
+ * sync task for the duration of one prune walk.
+ */
+typedef struct ddt_prune_info {
+ /* pool being pruned */
+ spa_t *dpi_spa;
+ /* number of times prune_candidates_sync() has run */
+ uint64_t dpi_txg_syncs;
+ /* number of entries whose phys was cleared */
+ uint64_t dpi_pruned;
+ /* pending ddt_prune_entry_t candidates */
+ list_t dpi_candidates;
+} ddt_prune_info_t;
+
+/*
+ * Add prune candidates for ddt_sync during spa_sync
+ *
+ * Sync task callback; 'arg' is the ddt_prune_info_t owned by
+ * ddt_prune_walk().  Every candidate queued on dpi_candidates is removed
+ * from the list, re-checked under the DDT lock and, if still eligible,
+ * has its flat phys cleared.  Each candidate is freed here regardless of
+ * the outcome.  dpi_pruned counts the cleared entries and dpi_txg_syncs
+ * counts invocations of this callback.
+ */
+static void
+prune_candidates_sync(void *arg, dmu_tx_t *tx)
+{
+	(void) tx;
+	ddt_prune_info_t *dpi = arg;
+	ddt_prune_entry_t *dpe;
+	/*
+	 * Candidates carry a trailing flat phys, so they must be freed with
+	 * the same size ddt_prune_entry() allocated them with.
+	 */
+	const size_t dpe_size =
+	    sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
+
+	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
+
+	/* Process the prune candidates collected so far */
+	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
+		ddt_t *ddt = dpe->dpe_ddt;
+
+		ddt_enter(ddt);
+
+		/*
+		 * If it's on the live list, then it was loaded for update
+		 * this txg and is no longer stale; skip it.
+		 */
+		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL) == NULL) {
+			blkptr_t blk;
+
+			ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
+			    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
+
+			ddt_entry_t *dde = ddt_lookup(ddt, &blk);
+			if (dde != NULL &&
+			    !(dde->dde_flags & DDE_FLAG_LOGGED)) {
+				ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+				/*
+				 * Zero the physical, so we don't try to free
+				 * DVAs at flush nor try to reuse this entry.
+				 */
+				ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
+
+				dpi->dpi_pruned++;
+			}
+		}
+
+		ddt_exit(ddt);
+		kmem_free(dpe, dpe_size);
+	}
+
+	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
+	dpi->dpi_txg_syncs++;
+}
+
+/*
+ * Queue one prune candidate.
+ *
+ * Candidates are collected in open context and processed later in sync
+ * context by prune_candidates_sync().  The candidate records the table,
+ * a copy of the key and a copy of the flat phys (DDT_FLAT_PHYS_SIZE
+ * bytes), and is inserted at the head of 'list'.  Only tables with
+ * DDT_FLAG_FLAT are expected here.
+ */
+static void
+ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
+    const ddt_univ_phys_t *ddp)
+{
+	ddt_prune_entry_t *cand;
+
+	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
+
+	/* Header plus the trailing flat phys */
+	cand = kmem_alloc(sizeof (*cand) + DDT_FLAT_PHYS_SIZE, KM_SLEEP);
+
+	cand->dpe_key = *ddk;
+	cand->dpe_ddt = ddt;
+	memcpy(cand->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
+
+	list_insert_head(list, cand);
+}
+
+/*
+ * Iterate over all the entries in the DDT unique class.
+ * The walk will perform one of the following operations:
+ *  (a) build a histogram that can be used when pruning
+ *  (b) prune entries older than the cutoff
+ *
+ * A 'cutoff' of 0 disables pruning; otherwise every walked entry whose
+ * class start time is before 'cutoff' becomes a prune candidate.
+ * 'histogram' may be NULL; when given it is zeroed and then filled with
+ * entry ages, binned by the power of two of the age in hours.
+ *
+ * The walk stops early if the pool is shutting down or a signal is
+ * pending.  Candidates already collected are still synced out.
+ *
+ * Also called by zdb(8) to dump the age histogram
+ */
+void
+ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
+{
+	ddt_bookmark_t ddb = {
+		.ddb_class = DDT_CLASS_UNIQUE,
+		.ddb_type = 0,
+		.ddb_checksum = 0,
+		.ddb_cursor = 0
+	};
+	ddt_lightweight_entry_t ddlwe = {0};
+	/* 64-bit so very large tables cannot overflow the counter */
+	uint64_t valid = 0;
+	/* same type as zfs_ddt_prunes_per_txg, which it is compared with */
+	uint32_t candidates = 0;
+	uint64_t now = gethrestime_sec();
+	ddt_prune_info_t dpi;
+	boolean_t pruning = (cutoff != 0);
+
+	if (pruning) {
+		dpi.dpi_txg_syncs = 0;
+		dpi.dpi_pruned = 0;
+		dpi.dpi_spa = spa;
+		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
+		    offsetof(ddt_prune_entry_t, dpe_node));
+	}
+
+	if (histogram != NULL)
+		memset(histogram, 0, sizeof (ddt_age_histo_t));
+
+	while (ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT,
+	    B_FALSE) == 0) {
+		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+		VERIFY(ddt);
+
+		if (spa_shutting_down(spa) || issig())
+			break;
+
+		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
+		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
+
+		uint64_t class_start =
+		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
+
+		/*
+		 * If this entry is on the log, then the stored entry is stale
+		 * and we should skip it.
+		 */
+		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
+			continue;
+
+		/* prune older entries */
+		if (pruning && class_start < cutoff) {
+			if (candidates >= zfs_ddt_prunes_per_txg) {
+				/* sync prune candidates in batches */
+				VERIFY0(dsl_sync_task(spa_name(spa),
+				    NULL, prune_candidates_sync,
+				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
+				candidates = 0;
+			}
+			ddt_prune_entry(&dpi.dpi_candidates, ddt,
+			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
+			candidates++;
+		}
+
+		/* build a histogram */
+		if (histogram != NULL) {
+			/*
+			 * Age in hours, at least 1.  An entry stamped in the
+			 * future (e.g. the clock was stepped back) is treated
+			 * as brand new rather than letting the subtraction
+			 * wrap and land it in the oldest bin.
+			 */
+			uint64_t age = 1;
+			if (now > class_start)
+				age = MAX(1, (now - class_start) / 3600);
+			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
+			histogram->dah_entries++;
+			histogram->dah_age_histo[bin]++;
+		}
+
+		valid++;
+	}
+
+	if (pruning) {
+		if (!list_is_empty(&dpi.dpi_candidates)) {
+			/* sync out final batch of prune candidates */
+			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+			    prune_candidates_sync, &dpi, 0,
+			    ZFS_SPACE_CHECK_NONE));
+		}
+		/* Always pair list_create() with list_destroy() */
+		list_destroy(&dpi.dpi_candidates);
+
+		if (valid > 0) {
+			zfs_dbgmsg("pruned %llu entries (%d%%) across %llu "
+			    "txg syncs",
+			    (u_longlong_t)dpi.dpi_pruned,
+			    (int)((dpi.dpi_pruned * 100) / valid),
+			    (u_longlong_t)dpi.dpi_txg_syncs);
+		}
+	}
+}
+
+/*
+ * Return the entry count (ddo_count) that ddt_get_dedup_object_stats()
+ * reports for the pool.
+ */
+static uint64_t
+ddt_total_entries(spa_t *spa)
+{
+	ddt_object_t stats;
+
+	ddt_get_dedup_object_stats(spa, &stats);
+	return (stats.ddo_count);
+}
+
+/*
+ * Prune entries from the unique class of the pool's dedup tables.
+ *
+ * unit == ZPOOL_DDT_PRUNE_AGE: 'amount' is an age in seconds; entries
+ * whose class start time is older than that are pruned.
+ *
+ * unit == ZPOOL_DDT_PRUNE_PERCENTAGE: 'amount' is a percentage of the
+ * unique entries.  A first walk builds an age histogram, from which a
+ * cutoff time is derived by consuming bins from oldest to newest; a
+ * second walk then prunes everything older than that cutoff.
+ *
+ * Returns 0 on success (including when there is nothing to prune),
+ * EALREADY if a prune is already active on this pool, and EINVAL for an
+ * unknown unit.  spa_active_ddt_prune is cleared on every path that
+ * set it.
+ */
+int
+ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
+    uint64_t amount)
+{
+	uint64_t cutoff = 0;
+	uint64_t start_time = gethrtime();
+	int error = 0;
+
+	if (spa->spa_active_ddt_prune)
+		return (SET_ERROR(EALREADY));
+	if (ddt_total_entries(spa) == 0)
+		return (0);
+
+	spa->spa_active_ddt_prune = B_TRUE;
+
+	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
+	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
+
+	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
+		ddt_age_histo_t histogram;
+		uint64_t oldest = 0;
+
+		/* Make a pass over DDT to build a histogram */
+		ddt_prune_walk(spa, 0, &histogram);
+
+		/*
+		 * Anything above 100% selects every entry anyway; clamping
+		 * keeps the multiplication from wrapping on absurd input.
+		 */
+		uint64_t target =
+		    (histogram.dah_entries * MIN(amount, 100)) / 100;
+
+		/*
+		 * Figure out our cutoff date
+		 * (i.e., which bins to prune from)
+		 */
+		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
+			uint64_t count = histogram.dah_age_histo[i];
+
+			if (count == 0)
+				continue;
+
+			if (target < count) {
+				/*
+				 * less than this bucket remaining; 64-bit
+				 * math, since 2^i hours in seconds does not
+				 * fit in an int for the higher bins
+				 */
+				oldest = (1ULL << i) * 3600;
+				target = 0;
+			} else {
+				target -= count;
+			}
+		}
+
+		uint64_t now = gethrestime_sec();
+		/* A cutoff before time zero means nothing is old enough */
+		cutoff = (oldest < now) ? now - oldest : 0;
+
+		if (ddt_dump_prune_histogram)
+			ddt_dump_age_histogram(&histogram, cutoff);
+	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
+		uint64_t now = gethrestime_sec();
+		/*
+		 * An age reaching back past time zero must not wrap around
+		 * into a cutoff in the far future (which would prune every
+		 * entry); nothing can be that old, so prune nothing.
+		 */
+		cutoff = (amount < now) ? now - amount : 0;
+	} else {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
+
+	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
+		/* Traverse DDT to prune entries older than our cutoff */
+		ddt_prune_walk(spa, cutoff, NULL);
+	}
+
+	zfs_dbgmsg("%s: prune completed in %llu ms",
+	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
+
+out:
+	/* Always release the "prune active" flag, even on error */
+	spa->spa_active_ddt_prune = B_FALSE;
+	return (error);
+}
+
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
"Enable prefetching dedup-ed blks");
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
index a367d0cd0..3aa07dc25 100644
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -353,16 +353,15 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
}
boolean_t
-ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
- ddt_lightweight_entry_t *ddlwe)
+ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
if (ddle == NULL)
return (B_FALSE);
- DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
-
- ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+ ddt_lightweight_entry_t ddlwe;
+ DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+ ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
@@ -371,6 +370,21 @@ ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
return (B_TRUE);
}
+/*
+ * Look for a key on the DDT log without removing it.
+ *
+ * The active log tree is searched first, then the flushing log tree.
+ * Returns B_TRUE if the key is on either one.  If 'ddlwe' is not NULL it
+ * is filled in from the log entry that was found; on a miss it is left
+ * untouched.
+ */
+boolean_t
+ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_t *logs[2] = { ddt->ddt_log_active, ddt->ddt_log_flushing };
+	ddt_log_entry_t *found = NULL;
+
+	for (int i = 0; i < 2 && found == NULL; i++)
+		found = avl_find(&logs[i]->ddl_tree, ddk, NULL);
+
+	if (found == NULL)
+		return (B_FALSE);
+
+	if (ddlwe != NULL) {
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, found, ddlwe);
+	}
+
+	return (B_TRUE);
+}
+
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 7ce2d9196..55bf9b683 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4342,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
}
+/* nvlist key names for the ZFS_IOC_DDT_PRUNE input arguments */
+#define DDT_PRUNE_UNIT "ddt_prune_unit"
+#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
+
+/*
+ * innvl: {
+ * "ddt_prune_unit" -> int32_t (a zpool_ddt_prune_unit_t value)
+ * "ddt_prune_amount" -> uint64_t
+ * }
+ *
+ * outnvl: not populated by zfs_ioc_ddt_prune()
+ */
+static const zfs_ioc_key_t zfs_keys_ddt_prune[] = {
+ {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0},
+ {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0},
+};
+
+/*
+ * ZFS_IOC_DDT_PRUNE handler.
+ *
+ * Reads the prune unit (int32) and amount (uint64) from 'innvl', opens
+ * the pool and calls ddt_prune_unique_entries().  Returns EINVAL if
+ * either argument is missing, the spa_open() error if the pool cannot
+ * be opened, ENOTSUP if the fast_dedup feature is not enabled on the
+ * pool, and otherwise the result of ddt_prune_unique_entries().
+ * 'outnvl' is not used.
+ */
+static int
+zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	(void) outnvl;
+	int32_t unit;
+	uint64_t amount;
+
+	if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 ||
+	    nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	spa_t *spa;
+	int error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit,
+	    amount);
+
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
/*
* This ioctl waits for activity of a particular type to complete. If there is
* no activity of that type in progress, it returns immediately, and the
@@ -7430,6 +7475,11 @@ zfs_ioctl_init(void)
POOL_CHECK_NONE, B_FALSE, B_FALSE,
zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props));
+ zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE,
+ zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune));
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a841e0a79..e4ccd144f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3859,6 +3859,16 @@ zio_ddt_free(zio_t *zio)
}
ddt_exit(ddt);
+ /*
+ * When no entry was found, it must have been pruned,
+ * so we can free it now instead of decrementing the
+ * refcount in the DDT.
+ */
+ if (!dde) {
+ BP_SET_DEDUP(bp, 0);
+ zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
+ }
+
return (zio);
}