aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys/fs
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2019-03-29 09:13:20 -0700
committerGitHub <[email protected]>2019-03-29 09:13:20 -0700
commit1b939560be5c51deecf875af9dada9d094633bf7 (patch)
tree2a780b838134636ddbc65f89d227e37c74abe17b /include/sys/fs
parentf94b3cbf43d62f4962e71cfe7ba8c6f0602e2a45 (diff)
Add TRIM support
UNMAP/TRIM support is a frequently-requested feature to help prevent performance from degrading on SSDs and on various other SAN-like storage back-ends. By issuing UNMAP/TRIM commands for sectors which are no longer allocated the underlying device can often more efficiently manage itself. This TRIM implementation is modeled on the `zpool initialize` feature which writes a pattern to all unallocated space in the pool. The new `zpool trim` command uses the same vdev_xlate() code to calculate what sectors are unallocated, the same per- vdev TRIM thread model and locking, and the same basic CLI for a consistent user experience. The core difference is that instead of writing a pattern it will issue UNMAP/TRIM commands for those extents. The zio pipeline was updated to accommodate this by adding a new ZIO_TYPE_TRIM type and associated spa taskq. This new type makes is straight forward to add the platform specific TRIM/UNMAP calls to vdev_disk.c and vdev_file.c. These new ZIO_TYPE_TRIM zios are handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs. This makes it possible to largely avoid changing the pipieline, one exception is that TRIM zio's may exceed the 16M block size limit since they contain no data. In addition to the manual `zpool trim` command, a background automatic TRIM was added and is controlled by the 'autotrim' property. It relies on the exact same infrastructure as the manual TRIM. However, instead of relying on the extents in a metaslab's ms_allocatable range tree, a ms_trim tree is kept per metaslab. When 'autotrim=on', ranges added back to the ms_allocatable tree are also added to the ms_free tree. The ms_free tree is then periodically consumed by an autotrim thread which systematically walks a top level vdev's metaslabs. Since the automatic TRIM will skip ranges it considers too small there is value in occasionally running a full `zpool trim`. This may occur when the freed blocks are small and not enough time was allowed to aggregate them. An automatic TRIM and a manual `zpool trim` may be run concurrently, in which case the automatic TRIM will yield to the manual TRIM. Reviewed-by: Jorgen Lundman <[email protected]> Reviewed-by: Tim Chase <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Serapheim Dimitropoulos <[email protected]> Contributions-by: Saso Kiselkov <[email protected]> Contributions-by: Tim Chase <[email protected]> Contributions-by: Chunwei Chen <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #8419 Closes #598
Diffstat (limited to 'include/sys/fs')
-rw-r--r--include/sys/fs/zfs.h69
1 files changed, 65 insertions, 4 deletions
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e49a58f43..bdc25ee9f 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -244,6 +244,7 @@ typedef enum {
ZPOOL_PROP_MULTIHOST,
ZPOOL_PROP_CHECKPOINT,
ZPOOL_PROP_LOAD_GUID,
+ ZPOOL_PROP_AUTOTRIM,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -635,6 +636,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue"
/* Queue sizes */
#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
@@ -642,6 +644,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue"
/* Latency read/write histogram stats */
#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
@@ -653,6 +656,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo"
/* Request size histograms */
#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
@@ -660,11 +664,13 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"
/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
@@ -777,6 +783,7 @@ typedef struct zpool_load_policy {
#define VDEV_ALLOC_BIAS_SPECIAL "special"
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+/* vdev initialize state */
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
"com.delphix:next_offset_to_initialize"
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
@@ -784,6 +791,20 @@ typedef struct zpool_load_policy {
#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
"com.delphix:vdev_initialize_action_time"
+/* vdev TRIM state */
+#define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \
+ "org.zfsonlinux:next_offset_to_trim"
+#define VDEV_LEAF_ZAP_TRIM_STATE \
+ "org.zfsonlinux:vdev_trim_state"
+#define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \
+ "org.zfsonlinux:vdev_trim_action_time"
+#define VDEV_LEAF_ZAP_TRIM_RATE \
+ "org.zfsonlinux:vdev_trim_rate"
+#define VDEV_LEAF_ZAP_TRIM_PARTIAL \
+ "org.zfsonlinux:vdev_trim_partial"
+#define VDEV_LEAF_ZAP_TRIM_SECURE \
+ "org.zfsonlinux:vdev_trim_secure"
+
/*
* This is needed in userland to report the minimum necessary device size.
*/
@@ -915,6 +936,7 @@ typedef enum zio_type {
ZIO_TYPE_FREE,
ZIO_TYPE_CLAIM,
ZIO_TYPE_IOCTL,
+ ZIO_TYPE_TRIM,
ZIO_TYPES
} zio_type_t;
@@ -982,8 +1004,14 @@ typedef enum zpool_errata {
/*
* Vdev statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
+ * is passed between kernel and user land as an nvlist uint64 array.
+ *
+ * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in
+ * order to keep subsequent members at their known fixed offsets. When
+ * adding a new field it must be added to the end the structure.
*/
+#define VS_ZIO_TYPES 6
+
typedef struct vdev_stat {
hrtime_t vs_timestamp; /* time since vdev load */
uint64_t vs_state; /* vdev state */
@@ -993,8 +1021,8 @@ typedef struct vdev_stat {
uint64_t vs_dspace; /* deflated capacity */
uint64_t vs_rsize; /* replaceable dev size */
uint64_t vs_esize; /* expandable dev size */
- uint64_t vs_ops[ZIO_TYPES]; /* operation count */
- uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
+ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */
+ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
@@ -1010,6 +1038,12 @@ typedef struct vdev_stat {
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
+ uint64_t vs_trim_errors; /* trimming errors */
+ uint64_t vs_trim_notsup; /* supported by device */
+ uint64_t vs_trim_bytes_done; /* bytes trimmed */
+ uint64_t vs_trim_bytes_est; /* total bytes to trim */
+ uint64_t vs_trim_state; /* vdev_trim_state_t */
+ uint64_t vs_trim_action_time; /* time_t */
} vdev_stat_t;
/*
@@ -1068,13 +1102,23 @@ typedef struct vdev_stat_ex {
* Initialize functions.
*/
typedef enum pool_initialize_func {
- POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_START,
POOL_INITIALIZE_CANCEL,
POOL_INITIALIZE_SUSPEND,
POOL_INITIALIZE_FUNCS
} pool_initialize_func_t;
/*
+ * TRIM functions.
+ */
+typedef enum pool_trim_func {
+ POOL_TRIM_START,
+ POOL_TRIM_CANCEL,
+ POOL_TRIM_SUSPEND,
+ POOL_TRIM_FUNCS
+} pool_trim_func_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -1126,6 +1170,14 @@ typedef enum {
VDEV_INITIALIZE_COMPLETE
} vdev_initializing_state_t;
+typedef enum {
+ VDEV_TRIM_NONE,
+ VDEV_TRIM_ACTIVE,
+ VDEV_TRIM_CANCELED,
+ VDEV_TRIM_SUSPENDED,
+ VDEV_TRIM_COMPLETE,
+} vdev_trim_state_t;
+
/*
* nvlist name constants. Facilitate restricting snapshot iteration range for
* the "list next snapshot" ioctl
@@ -1224,6 +1276,7 @@ typedef enum zfs_ioc {
ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */
ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */
ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */
+ ZFS_IOC_POOL_TRIM, /* 0x5a50 */
/*
* Linux - 3/64 numbers reserved.
@@ -1327,6 +1380,14 @@ typedef enum {
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_TRIM.
+ */
+#define ZPOOL_TRIM_COMMAND "trim_command"
+#define ZPOOL_TRIM_VDEVS "trim_vdevs"
+#define ZPOOL_TRIM_RATE "trim_rate"
+#define ZPOOL_TRIM_SECURE "trim_secure"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1