summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2019-03-29 09:13:20 -0700
committerGitHub <[email protected]>2019-03-29 09:13:20 -0700
commit1b939560be5c51deecf875af9dada9d094633bf7 (patch)
tree2a780b838134636ddbc65f89d227e37c74abe17b /include
parentf94b3cbf43d62f4962e71cfe7ba8c6f0602e2a45 (diff)
Add TRIM support
UNMAP/TRIM support is a frequently-requested feature to help prevent performance from degrading on SSDs and on various other SAN-like storage back-ends. By issuing UNMAP/TRIM commands for sectors which are no longer allocated the underlying device can often more efficiently manage itself. This TRIM implementation is modeled on the `zpool initialize` feature which writes a pattern to all unallocated space in the pool. The new `zpool trim` command uses the same vdev_xlate() code to calculate what sectors are unallocated, the same per-vdev TRIM thread model and locking, and the same basic CLI for a consistent user experience. The core difference is that instead of writing a pattern it will issue UNMAP/TRIM commands for those extents. The zio pipeline was updated to accommodate this by adding a new ZIO_TYPE_TRIM type and associated spa taskq. This new type makes it straightforward to add the platform-specific TRIM/UNMAP calls to vdev_disk.c and vdev_file.c. These new ZIO_TYPE_TRIM zios are handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs. This makes it possible to largely avoid changing the pipeline; one exception is that TRIM zios may exceed the 16M block size limit since they contain no data. In addition to the manual `zpool trim` command, a background automatic TRIM was added and is controlled by the 'autotrim' property. It relies on the exact same infrastructure as the manual TRIM. However, instead of relying on the extents in a metaslab's ms_allocatable range tree, a ms_trim tree is kept per metaslab. When 'autotrim=on', ranges added back to the ms_allocatable tree are also added to the ms_trim tree. The ms_trim tree is then periodically consumed by an autotrim thread which systematically walks a top level vdev's metaslabs. Since the automatic TRIM will skip ranges it considers too small there is value in occasionally running a full `zpool trim`. Ranges may be skipped when the freed blocks are small and not enough time was allowed to aggregate them. 
An automatic TRIM and a manual `zpool trim` may be run concurrently, in which case the automatic TRIM will yield to the manual TRIM. Reviewed-by: Jorgen Lundman <[email protected]> Reviewed-by: Tim Chase <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Serapheim Dimitropoulos <[email protected]> Contributions-by: Saso Kiselkov <[email protected]> Contributions-by: Tim Chase <[email protected]> Contributions-by: Chunwei Chen <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #8419 Closes #598
Diffstat (limited to 'include')
-rw-r--r--include/libzfs.h17
-rw-r--r--include/libzfs_core.h2
-rw-r--r--include/linux/blkdev_compat.h30
-rw-r--r--include/spl/sys/Makefile.am1
-rw-r--r--include/spl/sys/dkioc_free_util.h58
-rw-r--r--include/sys/Makefile.am1
-rw-r--r--include/sys/fs/zfs.h69
-rw-r--r--include/sys/metaslab.h2
-rw-r--r--include/sys/metaslab_impl.h25
-rw-r--r--include/sys/spa.h62
-rw-r--r--include/sys/spa_impl.h1
-rw-r--r--include/sys/sysevent/eventdefs.h5
-rw-r--r--include/sys/txg.h7
-rw-r--r--include/sys/vdev.h2
-rw-r--r--include/sys/vdev_impl.h30
-rw-r--r--include/sys/vdev_initialize.h2
-rw-r--r--include/sys/vdev_trim.h52
-rw-r--r--include/sys/zfs_context.h12
-rw-r--r--include/sys/zfs_debug.h1
-rw-r--r--include/sys/zio.h13
-rw-r--r--include/sys/zio_impl.h5
-rw-r--r--include/sys/zio_priority.h1
22 files changed, 314 insertions, 84 deletions
diff --git a/include/libzfs.h b/include/libzfs.h
index 3405bb99b..b604f1194 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -143,6 +143,9 @@ typedef enum zfs_error {
EZFS_INITIALIZING, /* currently initializing */
EZFS_NO_INITIALIZE, /* no active initialize */
EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */
+ EZFS_TRIMMING, /* currently trimming */
+ EZFS_NO_TRIM, /* no active trim */
+ EZFS_TRIM_NOTSUP, /* device does not support trim */
EZFS_UNKNOWN
} zfs_error_t;
@@ -253,12 +256,26 @@ typedef struct splitflags {
int name_flags;
} splitflags_t;
+typedef struct trimflags {
+ /* requested vdevs are for the entire pool */
+ boolean_t fullpool;
+
+ /* request a secure trim, requires support from device */
+ boolean_t secure;
+
+ /* trim at the requested rate in bytes/second */
+ uint64_t rate;
+} trimflags_t;
+
/*
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);
+extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
+ trimflags_t *);
+
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen_one(zpool_handle_t *, void *);
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 264ce3fa0..74a64d107 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -64,6 +64,8 @@ int lzc_unload_key(const char *);
int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t);
int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
nvlist_t **);
+int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t,
+ nvlist_t *, nvlist_t **);
int lzc_snaprange_space(const char *, const char *, uint64_t *);
diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
index 274552d5d..084ea61cc 100644
--- a/include/linux/blkdev_compat.h
+++ b/include/linux/blkdev_compat.h
@@ -609,6 +609,36 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
#endif /* HAVE_DISCARD_GRANULARITY */
/*
+ * 2.6.32 - 4.x API,
+ * blk_queue_discard()
+ *
+ * When HAVE_BLK_QUEUE_DISCARD is not defined fall back to a constant 0.
+ * The expansion must not end in a semicolon so that the macro can be used
+ * inside an expression, e.g. if (blk_queue_discard(q)).
+ */
+#if !defined(HAVE_BLK_QUEUE_DISCARD)
+#define blk_queue_discard(q) (0)
+#endif
+
+/*
+ * 4.8 - 4.x API,
+ * blk_queue_secure_erase()
+ *
+ * 2.6.36 - 4.7 API,
+ * blk_queue_secdiscard()
+ *
+ * 2.6.x - 2.6.35 API,
+ * Unsupported by kernel
+ *
+ * Compatibility wrapper which forwards to whichever of the above kernel
+ * interfaces is available and returns its result for the given request
+ * queue. HAVE_BLK_QUEUE_SECURE_ERASE is checked first, then
+ * HAVE_BLK_QUEUE_SECDISCARD. When neither is defined it always returns 0.
+ */
+static inline int
+blk_queue_discard_secure(struct request_queue *q)
+{
+#if defined(HAVE_BLK_QUEUE_SECURE_ERASE)
+ return (blk_queue_secure_erase(q));
+#elif defined(HAVE_BLK_QUEUE_SECDISCARD)
+ return (blk_queue_secdiscard(q));
+#else
+ return (0);
+#endif
+}
+
+/*
* Default Linux IO Scheduler,
* Setting the scheduler to noop will allow the Linux IO scheduler to
* still perform front and back merging, while leaving the request
diff --git a/include/spl/sys/Makefile.am b/include/spl/sys/Makefile.am
index e596ff373..3b5b2755a 100644
--- a/include/spl/sys/Makefile.am
+++ b/include/spl/sys/Makefile.am
@@ -11,7 +11,6 @@ KERNEL_H = \
$(top_srcdir)/include/spl/sys/ctype.h \
$(top_srcdir)/include/spl/sys/debug.h \
$(top_srcdir)/include/spl/sys/disp.h \
- $(top_srcdir)/include/spl/sys/dkioc_free_util.h \
$(top_srcdir)/include/spl/sys/dkio.h \
$(top_srcdir)/include/spl/sys/errno.h \
$(top_srcdir)/include/spl/sys/fcntl.h \
diff --git a/include/spl/sys/dkioc_free_util.h b/include/spl/sys/dkioc_free_util.h
deleted file mode 100644
index d519b2f8e..000000000
--- a/include/spl/sys/dkioc_free_util.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- * Copyright (C) 2007 The Regents of the University of California.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Brian Behlendorf <[email protected]>.
- * UCRL-CODE-235197
- *
- * This file is part of the SPL, Solaris Porting Layer.
- * For details, see <http://zfsonlinux.org/>.
- *
- * The SPL is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * The SPL is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with the SPL. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _SPL_DKIOC_UTIL_H
-#define _SPL_DKIOC_UTIL_H
-
-#include <sys/dkio.h>
-
-typedef struct dkioc_free_list_ext_s {
- uint64_t dfle_start;
- uint64_t dfle_length;
-} dkioc_free_list_ext_t;
-
-typedef struct dkioc_free_list_s {
- uint64_t dfl_flags;
- uint64_t dfl_num_exts;
- int64_t dfl_offset;
-
- /*
- * N.B. this is only an internal debugging API! This is only called
- * from debug builds of sd for pre-release checking. Remove before GA!
- */
- void (*dfl_ck_func)(uint64_t, uint64_t, void *);
- void *dfl_ck_arg;
-
- dkioc_free_list_ext_t dfl_exts[1];
-} dkioc_free_list_t;
-
-static inline void dfl_free(dkioc_free_list_t *dfl) {
- vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
-}
-
-static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) {
- return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags));
-}
-
-#endif /* _SPL_DKIOC_UTIL_H */
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index e6c82d113..31ffdfb4a 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -100,6 +100,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/vdev_raidz_impl.h \
$(top_srcdir)/include/sys/vdev_removal.h \
+ $(top_srcdir)/include/sys/vdev_trim.h \
$(top_srcdir)/include/sys/xvattr.h \
$(top_srcdir)/include/sys/zap.h \
$(top_srcdir)/include/sys/zap_impl.h \
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e49a58f43..bdc25ee9f 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -244,6 +244,7 @@ typedef enum {
ZPOOL_PROP_MULTIHOST,
ZPOOL_PROP_CHECKPOINT,
ZPOOL_PROP_LOAD_GUID,
+ ZPOOL_PROP_AUTOTRIM,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -635,6 +636,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue"
/* Queue sizes */
#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
@@ -642,6 +644,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue"
/* Latency read/write histogram stats */
#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
@@ -653,6 +656,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo"
/* Request size histograms */
#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
@@ -660,11 +664,13 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"
/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
@@ -777,6 +783,7 @@ typedef struct zpool_load_policy {
#define VDEV_ALLOC_BIAS_SPECIAL "special"
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+/* vdev initialize state */
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
"com.delphix:next_offset_to_initialize"
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
@@ -784,6 +791,20 @@ typedef struct zpool_load_policy {
#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
"com.delphix:vdev_initialize_action_time"
+/* vdev TRIM state */
+#define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \
+ "org.zfsonlinux:next_offset_to_trim"
+#define VDEV_LEAF_ZAP_TRIM_STATE \
+ "org.zfsonlinux:vdev_trim_state"
+#define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \
+ "org.zfsonlinux:vdev_trim_action_time"
+#define VDEV_LEAF_ZAP_TRIM_RATE \
+ "org.zfsonlinux:vdev_trim_rate"
+#define VDEV_LEAF_ZAP_TRIM_PARTIAL \
+ "org.zfsonlinux:vdev_trim_partial"
+#define VDEV_LEAF_ZAP_TRIM_SECURE \
+ "org.zfsonlinux:vdev_trim_secure"
+
/*
* This is needed in userland to report the minimum necessary device size.
*/
@@ -915,6 +936,7 @@ typedef enum zio_type {
ZIO_TYPE_FREE,
ZIO_TYPE_CLAIM,
ZIO_TYPE_IOCTL,
+ ZIO_TYPE_TRIM,
ZIO_TYPES
} zio_type_t;
@@ -982,8 +1004,14 @@ typedef enum zpool_errata {
/*
* Vdev statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
+ * is passed between kernel and user land as an nvlist uint64 array.
+ *
+ * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in
+ * order to keep subsequent members at their known fixed offsets. When
+ * adding a new field it must be added to the end of the structure.
*/
+#define VS_ZIO_TYPES 6
+
typedef struct vdev_stat {
hrtime_t vs_timestamp; /* time since vdev load */
uint64_t vs_state; /* vdev state */
@@ -993,8 +1021,8 @@ typedef struct vdev_stat {
uint64_t vs_dspace; /* deflated capacity */
uint64_t vs_rsize; /* replaceable dev size */
uint64_t vs_esize; /* expandable dev size */
- uint64_t vs_ops[ZIO_TYPES]; /* operation count */
- uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
+ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */
+ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
@@ -1010,6 +1038,12 @@ typedef struct vdev_stat {
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
+ uint64_t vs_trim_errors; /* trimming errors */
+ uint64_t vs_trim_notsup; /* TRIM not supported by device */
+ uint64_t vs_trim_bytes_done; /* bytes trimmed */
+ uint64_t vs_trim_bytes_est; /* total bytes to trim */
+ uint64_t vs_trim_state; /* vdev_trim_state_t */
+ uint64_t vs_trim_action_time; /* time_t */
} vdev_stat_t;
/*
@@ -1068,13 +1102,23 @@ typedef struct vdev_stat_ex {
* Initialize functions.
*/
typedef enum pool_initialize_func {
- POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_START,
POOL_INITIALIZE_CANCEL,
POOL_INITIALIZE_SUSPEND,
POOL_INITIALIZE_FUNCS
} pool_initialize_func_t;
/*
+ * TRIM functions.
+ */
+typedef enum pool_trim_func {
+ POOL_TRIM_START,
+ POOL_TRIM_CANCEL,
+ POOL_TRIM_SUSPEND,
+ POOL_TRIM_FUNCS
+} pool_trim_func_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -1126,6 +1170,14 @@ typedef enum {
VDEV_INITIALIZE_COMPLETE
} vdev_initializing_state_t;
+typedef enum {
+ VDEV_TRIM_NONE,
+ VDEV_TRIM_ACTIVE,
+ VDEV_TRIM_CANCELED,
+ VDEV_TRIM_SUSPENDED,
+ VDEV_TRIM_COMPLETE,
+} vdev_trim_state_t;
+
/*
* nvlist name constants. Facilitate restricting snapshot iteration range for
* the "list next snapshot" ioctl
@@ -1224,6 +1276,7 @@ typedef enum zfs_ioc {
ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */
ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */
ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */
+ ZFS_IOC_POOL_TRIM, /* 0x5a50 */
/*
* Linux - 3/64 numbers reserved.
@@ -1327,6 +1380,14 @@ typedef enum {
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_TRIM.
+ */
+#define ZPOOL_TRIM_COMMAND "trim_command"
+#define ZPOOL_TRIM_VDEVS "trim_vdevs"
+#define ZPOOL_TRIM_RATE "trim_rate"
+#define ZPOOL_TRIM_SECURE "trim_secure"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index a513a6470..2790d06c7 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -120,6 +120,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
+void metaslab_disable(metaslab_t *);
+void metaslab_enable(metaslab_t *, boolean_t);
#ifdef __cplusplus
}
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 676c5dd46..ca1104c14 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -69,7 +69,7 @@ typedef enum trace_alloc_type {
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
TRACE_VDEV_ERROR = -8ULL,
- TRACE_INITIALIZING = -9ULL
+ TRACE_DISABLED = -9ULL,
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
@@ -272,10 +272,10 @@ struct metaslab_group {
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
- int mg_ms_initializing;
- boolean_t mg_initialize_updating;
- kmutex_t mg_ms_initialize_lock;
- kcondvar_t mg_ms_initialize_cv;
+ int mg_ms_disabled;
+ boolean_t mg_disabled_updating;
+ kmutex_t mg_ms_disabled_lock;
+ kcondvar_t mg_ms_disabled_cv;
};
/*
@@ -389,11 +389,24 @@ struct metaslab {
range_tree_t *ms_defer[TXG_DEFER_SIZE];
range_tree_t *ms_checkpointing; /* to add to the checkpoint */
+ /*
+ * The ms_trim tree is the set of allocatable segments which are
+ * eligible for trimming. (When the metaslab is loaded, it's a
+ * subset of ms_allocatable.) It's kept in-core as long as the
+ * autotrim property is set and is not vacated when the metaslab
+ * is unloaded. Its purpose is to aggregate freed ranges to
+ * facilitate efficient trimming.
+ */
+ range_tree_t *ms_trim;
+
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
- uint64_t ms_initializing; /* leaves initializing this ms */
+ /*
+ * The number of consumers which have disabled the metaslab.
+ */
+ uint64_t ms_disabled;
/*
* We must always hold the ms_lock when modifying ms_loaded
diff --git a/include/sys/spa.h b/include/sys/spa.h
index febf0e8f2..343977b30 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -738,6 +738,24 @@ typedef enum spa_import_type {
SPA_IMPORT_ASSEMBLE
} spa_import_type_t;
+/*
+ * Send TRIM commands in-line during normal pool operation while deleting.
+ * OFF: no
+ * ON: yes
+ */
+typedef enum {
+ SPA_AUTOTRIM_OFF = 0, /* default */
+ SPA_AUTOTRIM_ON
+} spa_autotrim_t;
+
+/*
+ * Reason TRIM command was issued, used internally for accounting purposes.
+ */
+typedef enum trim_type {
+ TRIM_TYPE_MANUAL = 0,
+ TRIM_TYPE_AUTO = 1,
+} trim_type_t;
+
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
@@ -764,15 +782,17 @@ extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
-#define SPA_ASYNC_CONFIG_UPDATE 0x01
-#define SPA_ASYNC_REMOVE 0x02
-#define SPA_ASYNC_PROBE 0x04
-#define SPA_ASYNC_RESILVER_DONE 0x08
-#define SPA_ASYNC_RESILVER 0x10
-#define SPA_ASYNC_AUTOEXPAND 0x20
-#define SPA_ASYNC_REMOVE_DONE 0x40
-#define SPA_ASYNC_REMOVE_STOP 0x80
-#define SPA_ASYNC_INITIALIZE_RESTART 0x100
+#define SPA_ASYNC_CONFIG_UPDATE 0x01
+#define SPA_ASYNC_REMOVE 0x02
+#define SPA_ASYNC_PROBE 0x04
+#define SPA_ASYNC_RESILVER_DONE 0x08
+#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
+#define SPA_ASYNC_TRIM_RESTART 0x200
+#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
/*
* Controls the behavior of spa_vdev_remove().
@@ -790,6 +810,8 @@ extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
nvlist_t *vdev_errlist);
+extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+ uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -887,6 +909,7 @@ typedef struct spa_stats {
spa_history_kstat_t io_history;
spa_history_list_t mmp_history;
spa_history_kstat_t state; /* pool state */
+ spa_history_kstat_t iostats;
} spa_stats_t;
typedef enum txg_state {
@@ -905,6 +928,22 @@ typedef struct txg_stat {
uint64_t ndirty;
} txg_stat_t;
+/* Assorted pool IO kstats */
+typedef struct spa_iostats {
+ kstat_named_t trim_extents_written;
+ kstat_named_t trim_bytes_written;
+ kstat_named_t trim_extents_skipped;
+ kstat_named_t trim_bytes_skipped;
+ kstat_named_t trim_extents_failed;
+ kstat_named_t trim_bytes_failed;
+ kstat_named_t autotrim_extents_written;
+ kstat_named_t autotrim_bytes_written;
+ kstat_named_t autotrim_extents_skipped;
+ kstat_named_t autotrim_bytes_skipped;
+ kstat_named_t autotrim_extents_failed;
+ kstat_named_t autotrim_bytes_failed;
+} spa_iostats_t;
+
extern void spa_stats_init(spa_t *spa);
extern void spa_stats_destroy(spa_t *spa);
extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
@@ -922,6 +961,10 @@ extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
int error);
+extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+ uint64_t extents_written, uint64_t bytes_written,
+ uint64_t extents_skipped, uint64_t bytes_skipped,
+ uint64_t extents_failed, uint64_t bytes_failed);
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
@@ -1005,6 +1048,7 @@ extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_deadman_ziotime(spa_t *spa);
extern uint64_t spa_dirty_data(spa_t *spa);
+extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index c3aaad611..66032d9aa 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -378,6 +378,7 @@ struct spa {
uint64_t spa_deadman_ziotime; /* deadman zio expiration */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
+ uint64_t spa_autotrim; /* automatic background trim? */
uint64_t spa_errata; /* errata issues detected */
spa_stats_t spa_stats; /* assorted spa statistics */
spa_keystore_t spa_keystore; /* loaded crypto keys */
diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h
index aa13bd505..2067b355a 100644
--- a/include/sys/sysevent/eventdefs.h
+++ b/include/sys/sysevent/eventdefs.h
@@ -118,6 +118,11 @@ extern "C" {
#define ESC_ZFS_BOOTFS_VDEV_ATTACH "bootfs_vdev_attach"
#define ESC_ZFS_POOL_REGUID "pool_reguid"
#define ESC_ZFS_HISTORY_EVENT "history_event"
+#define ESC_ZFS_TRIM_START "trim_start"
+#define ESC_ZFS_TRIM_FINISH "trim_finish"
+#define ESC_ZFS_TRIM_CANCEL "trim_cancel"
+#define ESC_ZFS_TRIM_RESUME "trim_resume"
+#define ESC_ZFS_TRIM_SUSPEND "trim_suspend"
/*
* datalink subclass definitions.
diff --git a/include/sys/txg.h b/include/sys/txg.h
index ed0e7297c..760d5208b 100644
--- a/include/sys/txg.h
+++ b/include/sys/txg.h
@@ -90,10 +90,11 @@ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
/*
* Wait until the given transaction group, or one after it, is
* the open transaction group. Try to make this happen as soon
- * as possible (eg. kick off any necessary syncs immediately).
- * If txg == 0, wait for the next open txg.
+ * as possible (eg. kick off any necessary syncs immediately) when
+ * should_quiesce is set. If txg == 0, wait for the next open txg.
*/
-extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg,
+ boolean_t should_quiesce);
/*
* Returns TRUE if we are "backed up" waiting for the syncing
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 2091892b2..67ca0d116 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -95,6 +95,8 @@ extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c115a5e10..f6f7bbb4b 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -145,6 +145,7 @@ struct vdev_queue {
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
+ avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
@@ -260,6 +261,7 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+ /* Initialize related */
boolean_t vdev_initialize_exit_wanted;
vdev_initializing_state_t vdev_initialize_state;
list_node_t vdev_initialize_node;
@@ -274,10 +276,34 @@ struct vdev {
uint64_t vdev_initialize_bytes_done;
time_t vdev_initialize_action_time; /* start and end time */
- /* for limiting outstanding I/Os */
+ /* TRIM related */
+ boolean_t vdev_trim_exit_wanted;
+ boolean_t vdev_autotrim_exit_wanted;
+ vdev_trim_state_t vdev_trim_state;
+ list_node_t vdev_trim_node;
+ kmutex_t vdev_autotrim_lock;
+ kcondvar_t vdev_autotrim_cv;
+ kthread_t *vdev_autotrim_thread;
+ /* Protects vdev_trim_thread and vdev_trim_state. */
+ kmutex_t vdev_trim_lock;
+ kcondvar_t vdev_trim_cv;
+ kthread_t *vdev_trim_thread;
+ uint64_t vdev_trim_offset[TXG_SIZE];
+ uint64_t vdev_trim_last_offset;
+ uint64_t vdev_trim_bytes_est;
+ uint64_t vdev_trim_bytes_done;
+ uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */
+ uint64_t vdev_trim_partial; /* requested partial TRIM */
+ uint64_t vdev_trim_secure; /* requested secure TRIM */
+ time_t vdev_trim_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os (initialize and TRIM) */
kmutex_t vdev_initialize_io_lock;
kcondvar_t vdev_initialize_io_cv;
uint64_t vdev_initialize_inflight;
+ kmutex_t vdev_trim_io_lock;
+ kcondvar_t vdev_trim_io_cv;
+ uint64_t vdev_trim_inflight[2];
/*
* Values stored in the config for an indirect or removing vdev.
@@ -343,6 +369,8 @@ struct vdev {
uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_has_trim; /* TRIM is supported */
+ boolean_t vdev_has_securetrim; /* secure TRIM is supported */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h
index 319fb9bc0..81d39ebeb 100644
--- a/include/sys/vdev_initialize.h
+++ b/include/sys/vdev_initialize.h
@@ -39,8 +39,6 @@ extern void vdev_initialize_stop_all(vdev_t *vd,
vdev_initializing_state_t tgt_state);
extern void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list);
extern void vdev_initialize_restart(vdev_t *vd);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
- range_seg_t *physical_rs);
#ifdef __cplusplus
}
diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h
new file mode 100644
index 000000000..1e5401766
--- /dev/null
+++ b/include/sys/vdev_trim.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_VDEV_TRIM_H
+#define _SYS_VDEV_TRIM_H
+
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int zfs_trim_metaslab_skip;
+
+extern void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial,
+ boolean_t secure);
+extern void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt, list_t *vd_list);
+extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state);
+extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list);
+extern void vdev_trim_restart(vdev_t *vd);
+extern void vdev_autotrim(spa_t *spa);
+extern void vdev_autotrim_stop_all(spa_t *spa);
+extern void vdev_autotrim_stop_wait(vdev_t *vd);
+extern void vdev_autotrim_restart(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_TRIM_H */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 260b8a458..87ddde30a 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -579,6 +579,8 @@ typedef struct vsecattr {
#define CRCREAT 0
+#define F_FREESP 11
+
extern int fop_getattr(vnode_t *vp, vattr_t *vap);
#define VOP_CLOSE(vp, f, c, o, cr, ct) vn_close(vp)
@@ -587,6 +589,16 @@ extern int fop_getattr(vnode_t *vp, vattr_t *vap);
#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd)
+#if defined(HAVE_FILE_FALLOCATE) && \
+ defined(FALLOC_FL_PUNCH_HOLE) && \
+ defined(FALLOC_FL_KEEP_SIZE)
+#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) \
+ fallocate((vp)->v_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, \
+ (flck)->l_start, (flck)->l_len)
+#else
+#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) (0)
+#endif
+
#define VN_RELE(vp) vn_close(vp)
extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 7564ae0e4..7968a01cd 100644
--- a/include/sys/zfs_debug.h
+++ b/include/sys/zfs_debug.h
@@ -54,6 +54,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
#define ZFS_DEBUG_SET_ERROR (1 << 9)
#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
+#define ZFS_DEBUG_TRIM (1 << 11)
extern void __zfs_dbgmsg(char *buf);
extern void __dprintf(boolean_t dprint, const char *file, const char *func,
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4b7ad3e22..e69bf9208 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -416,6 +416,14 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio);
#define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02
+/*
+ * The io_trim_flags are used to specify the type of TRIM to perform. They
+ * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
+ */
+enum trim_flag {
+ ZIO_TRIM_SECURE = 1 << 0,
+};
+
typedef struct zio_alloc_list {
list_t zal_list;
uint64_t zal_size;
@@ -434,6 +442,7 @@ struct zio {
zio_prop_t io_prop;
zio_type_t io_type;
enum zio_child io_child_type;
+ enum trim_flag io_trim_flags;
int io_cmd;
zio_priority_t io_priority;
uint8_t io_reexecute;
@@ -549,6 +558,10 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, enum zio_flag flags);
+extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ zio_done_func_t *done, void *private, zio_priority_t priority,
+ enum zio_flag flags, enum trim_flag trim_flags);
+
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 344048c6a..fbbe06eb0 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -250,6 +250,11 @@ enum zio_stage {
ZIO_STAGE_VDEV_IO_START | \
ZIO_STAGE_VDEV_IO_ASSESS)
+#define ZIO_TRIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_VDEV_IO_STAGES)
+
#define ZIO_BLOCKING_STAGES \
(ZIO_STAGE_DVA_ALLOCATE | \
ZIO_STAGE_DVA_CLAIM | \
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index d8e6a1745..0b422904e 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -30,6 +30,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
+ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;