author     Brian Behlendorf <[email protected]>   2019-03-29 09:13:20 -0700
committer  GitHub <[email protected]>             2019-03-29 09:13:20 -0700
commit     1b939560be5c51deecf875af9dada9d094633bf7 (patch)
tree       2a780b838134636ddbc65f89d227e37c74abe17b /lib/libzfs/libzfs_pool.c
parent     f94b3cbf43d62f4962e71cfe7ba8c6f0602e2a45 (diff)
Add TRIM support
UNMAP/TRIM support is a frequently-requested feature to help
prevent performance from degrading on SSDs and on various other
SAN-like storage back-ends. By issuing UNMAP/TRIM commands for
sectors which are no longer allocated, the underlying device can
often manage itself more efficiently.
This TRIM implementation is modeled on the `zpool initialize`
feature, which writes a pattern to all unallocated space in the
pool. The new `zpool trim` command uses the same vdev_xlate()
code to calculate which sectors are unallocated, the same
per-vdev TRIM thread model and locking, and the same basic CLI
for a consistent user experience. The core difference is that
instead of writing a pattern it issues UNMAP/TRIM commands for
those extents.
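For illustration only, a minimal libzfs consumer of the new call
might look as follows. zpool_trim() and the trimflags_t fields
match the code in this change; POOL_TRIM_START, the function name,
and the "tank"/"/dev/sda1" names are placeholder assumptions:

    #include <libzfs.h>

    /*
     * Sketch (not from this commit): start a manual TRIM of one
     * named vdev.  The keys of the vds nvlist name the vdevs to
     * act on; the values are unused.
     */
    int
    trim_one_vdev(libzfs_handle_t *hdl)
    {
            zpool_handle_t *zhp = zpool_open(hdl, "tank");
            if (zhp == NULL)
                    return (-1);

            trimflags_t flags = {
                    .fullpool = B_FALSE,    /* trimming a named vdev */
                    .secure = B_FALSE,      /* plain, not secure, TRIM */
                    .rate = 0,              /* no rate limit */
            };

            nvlist_t *vds = fnvlist_alloc();
            fnvlist_add_boolean(vds, "/dev/sda1");

            int err = zpool_trim(zhp, POOL_TRIM_START, vds, &flags);

            fnvlist_free(vds);
            zpool_close(zhp);
            return (err);
    }

The zpool(8) command drives the same entry point, so per-vdev
failures surface through the error lists visible in the diff below.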
The zio pipeline was updated to accommodate this by adding a new
ZIO_TYPE_TRIM type and an associated spa taskq. The new type makes
it straightforward to add the platform-specific TRIM/UNMAP calls
to vdev_disk.c and vdev_file.c. These new ZIO_TYPE_TRIM zios are
handled largely the same way as ZIO_TYPE_READ and ZIO_TYPE_WRITE
zios, which makes it possible to largely avoid changing the
pipeline; the one exception is that TRIM zios may exceed the 16M
block size limit since they contain no data.
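To give a feel for the platform-specific leg, a file-backed vdev
can honor a TRIM zio by punching a hole over the extent. A minimal
sketch, assuming Linux fallocate(2) and not the actual vdev_file.c
code (the in-tree handler is driven by a ZIO_TYPE_TRIM zio rather
than a raw fd/offset/length):

    #define _GNU_SOURCE
    #include <fcntl.h>

    /*
     * Illustrative only: discard one extent of a file-backed vdev
     * by hole punching, letting the backing filesystem reclaim
     * the space.
     */
    static int
    file_trim_extent(int fd, off_t offset, off_t length)
    {
            return (fallocate(fd, FALLOC_FL_PUNCH_HOLE |
                FALLOC_FL_KEEP_SIZE, offset, length));
    }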
In addition to the manual `zpool trim` command, a background
automatic TRIM was added and is controlled by the 'autotrim'
property. It relies on the exact same infrastructure as the
manual TRIM. However, instead of relying on the extents in a
metaslab's ms_allocatable range tree, a ms_trim tree is kept
per metaslab. When 'autotrim=on', ranges added back to the
ms_allocatable tree are also added to the ms_trim tree. The
ms_trim tree is then periodically consumed by an autotrim
thread which systematically walks a top-level vdev's metaslabs.
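Conceptually the consuming side reduces to a loop like the one
below. Every type and helper in this sketch (trim_extent_t,
autotrim_pass, issue_trim) is a hypothetical stand-in; the real
thread works against range trees under the metaslab locks:

    #include <stdint.h>
    #include <stdlib.h>

    /*
     * Conceptual sketch only: consume queued free extents, TRIM
     * the large ones, and drop runts for a later manual
     * `zpool trim` to catch.
     */
    typedef struct trim_extent {
            uint64_t te_offset;             /* start of freed range */
            uint64_t te_length;             /* length of freed range */
            struct trim_extent *te_next;
    } trim_extent_t;

    static void
    autotrim_pass(trim_extent_t **queue, uint64_t min_ext_size,
        void (*issue_trim)(uint64_t, uint64_t))
    {
            trim_extent_t *ext;

            while ((ext = *queue) != NULL) {
                    *queue = ext->te_next;
                    if (ext->te_length >= min_ext_size)
                            issue_trim(ext->te_offset, ext->te_length);
                    free(ext);      /* consumed either way */
            }
    }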
Since the automatic TRIM will skip ranges it considers too small,
there is value in occasionally running a full `zpool trim`. Such
small ranges typically accumulate when blocks are freed in small
pieces and not enough time was allowed for them to aggregate. An
automatic TRIM and a manual `zpool trim` may be run concurrently,
in which case the automatic TRIM will yield to the manual TRIM.
Reviewed-by: Jorgen Lundman <[email protected]>
Reviewed-by: Tim Chase <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Contributions-by: Saso Kiselkov <[email protected]>
Contributions-by: Tim Chase <[email protected]>
Contributions-by: Chunwei Chen <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #8419
Closes #598
Diffstat (limited to 'lib/libzfs/libzfs_pool.c')
-rw-r--r--   lib/libzfs/libzfs_pool.c   211
1 file changed, 171 insertions, 40 deletions
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index f799471e4..6c797d06b 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2092,6 +2092,57 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
 	return (ret);
 }
 
+/*
+ * Translate vdev names to guids. If a vdev_path is determined to be
+ * unsuitable then a vd_errlist is allocated and the vdev path and errno
+ * are added to it.
+ */
+static int
+zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds,
+    nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist)
+{
+	nvlist_t *errlist = NULL;
+	int error = 0;
+
+	for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(vds, elem)) {
+		boolean_t spare, cache;
+
+		char *vd_path = nvpair_name(elem);
+		nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache,
+		    NULL);
+
+		if ((tgt == NULL) || cache || spare) {
+			if (errlist == NULL) {
+				errlist = fnvlist_alloc();
+				error = EINVAL;
+			}
+
+			uint64_t err = (tgt == NULL) ? EZFS_NODEVICE :
+			    (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+			fnvlist_add_int64(errlist, vd_path, err);
+			continue;
+		}
+
+		uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+		fnvlist_add_uint64(vdev_guids, vd_path, guid);
+
+		char msg[MAXNAMELEN];
+		(void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
+		fnvlist_add_string(guids_to_paths, msg, vd_path);
+	}
+
+	if (error != 0) {
+		verify(errlist != NULL);
+		if (vd_errlist != NULL)
+			*vd_errlist = errlist;
+		else
+			fnvlist_free(errlist);
+	}
+
+	return (error);
+}
+
 static int
 xlate_init_err(int err)
 {
@@ -2118,72 +2169,152 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
     nvlist_t *vds)
 {
 	char msg[1024];
-	libzfs_handle_t *hdl = zhp->zpool_hdl;
-
-	nvlist_t *errlist;
+	int err;
 
-	/* translate vdev names to guids */
 	nvlist_t *vdev_guids = fnvlist_alloc();
 	nvlist_t *guids_to_paths = fnvlist_alloc();
-	boolean_t spare, cache;
-	nvlist_t *tgt;
+	nvlist_t *vd_errlist = NULL;
+	nvlist_t *errlist;
 	nvpair_t *elem;
 
-	for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
-	    elem = nvlist_next_nvpair(vds, elem)) {
-		char *vd_path = nvpair_name(elem);
-		tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
+	err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
+	    guids_to_paths, &vd_errlist);
 
-		if ((tgt == NULL) || cache || spare) {
-			(void) snprintf(msg, sizeof (msg),
-			    dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
-			    vd_path);
-			int err = (tgt == NULL) ? EZFS_NODEVICE :
-			    (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+	if (err == 0) {
+		err = lzc_initialize(zhp->zpool_name, cmd_type,
+		    vdev_guids, &errlist);
+		if (err == 0) {
 			fnvlist_free(vdev_guids);
 			fnvlist_free(guids_to_paths);
-			return (zfs_error(hdl, err, msg));
+			return (0);
 		}
 
-		uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
-		fnvlist_add_uint64(vdev_guids, vd_path, guid);
+		if (errlist != NULL) {
+			vd_errlist = fnvlist_lookup_nvlist(errlist,
+			    ZPOOL_INITIALIZE_VDEVS);
+		}
 
-		(void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
-		fnvlist_add_string(guids_to_paths, msg, vd_path);
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "operation failed"));
+	} else {
+		verify(vd_errlist != NULL);
+	}
+
+	for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(vd_errlist, elem)) {
+		int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
+		char *path;
+
+		if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
+		    &path) != 0)
+			path = nvpair_name(elem);
+
+		(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
+		    "cannot initialize '%s'", path);
 	}
 
-	int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
-	    &errlist);
 	fnvlist_free(vdev_guids);
+	fnvlist_free(guids_to_paths);
 
-	if (err == 0) {
-		fnvlist_free(guids_to_paths);
-		return (0);
+	if (vd_errlist != NULL) {
+		fnvlist_free(vd_errlist);
+		return (-1);
+	}
+
+	return (zpool_standard_error(zhp->zpool_hdl, err, msg));
+}
+
+static int
+xlate_trim_err(int err)
+{
+	switch (err) {
+	case ENODEV:
+		return (EZFS_NODEVICE);
+	case EINVAL:
+	case EROFS:
+		return (EZFS_BADDEV);
+	case EBUSY:
+		return (EZFS_TRIMMING);
+	case ESRCH:
+		return (EZFS_NO_TRIM);
+	case EOPNOTSUPP:
+		return (EZFS_TRIM_NOTSUP);
 	}
+	return (err);
+}
+
+/*
+ * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for
+ * the given vdevs in the given pool.
+ */
+int
+zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
+    trimflags_t *trim_flags)
+{
+	char msg[1024];
+	int err;
 
+	nvlist_t *vdev_guids = fnvlist_alloc();
+	nvlist_t *guids_to_paths = fnvlist_alloc();
 	nvlist_t *vd_errlist = NULL;
-	if (errlist != NULL) {
-		vd_errlist = fnvlist_lookup_nvlist(errlist,
-		    ZPOOL_INITIALIZE_VDEVS);
+	nvlist_t *errlist;
+	nvpair_t *elem;
+
+	err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
+	    guids_to_paths, &vd_errlist);
+	if (err == 0) {
+		err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate,
+		    trim_flags->secure, vdev_guids, &errlist);
+		if (err == 0) {
+			fnvlist_free(vdev_guids);
+			fnvlist_free(guids_to_paths);
+			return (0);
+		}
+
+		if (errlist != NULL) {
+			vd_errlist = fnvlist_lookup_nvlist(errlist,
+			    ZPOOL_TRIM_VDEVS);
+		}
+
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "operation failed"));
+	} else {
+		verify(vd_errlist != NULL);
 	}
 
-	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "operation failed"));
+	for (elem = nvlist_next_nvpair(vd_errlist, NULL);
+	    elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) {
+		int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem));
+		char *path;
 
-	for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
-	    elem = nvlist_next_nvpair(vd_errlist, elem)) {
-		int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
-		char *path = fnvlist_lookup_string(guids_to_paths,
-		    nvpair_name(elem));
-		(void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
-		    path);
+		/*
+		 * If only the pool was specified, and it was not a secure
+		 * trim then suppress warnings for individual vdevs which
+		 * do not support trimming.
+		 */
+		if (vd_error == EZFS_TRIM_NOTSUP &&
+		    trim_flags->fullpool &&
+		    !trim_flags->secure) {
+			continue;
+		}
+
+		if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
+		    &path) != 0)
+			path = nvpair_name(elem);
+
+		(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
+		    "cannot trim '%s'", path);
 	}
 
+	fnvlist_free(vdev_guids);
 	fnvlist_free(guids_to_paths);
-	if (vd_errlist != NULL)
+
+	if (vd_errlist != NULL) {
+		fnvlist_free(vd_errlist);
 		return (-1);
+	}
 
-	return (zpool_standard_error(hdl, err, msg));
+	return (zpool_standard_error(zhp->zpool_hdl, err, msg));
 }
 
 /*