summary refs log tree commit diff stats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r-- module/zfs/arc.c 141
-rw-r--r-- module/zfs/spa.c 20
-rw-r--r-- module/zfs/vdev.c 13
-rw-r--r-- module/zfs/vdev_removal.c 14
-rw-r--r-- module/zfs/vdev_trim.c 245
5 files changed, 392 insertions, 41 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 29da08a49..e7ad976af 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -301,6 +301,7 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
#include <cityhash.h>
+#include <sys/vdev_trim.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
-static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -865,6 +865,23 @@ static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
/*
+ * L2ARC TRIM
+ * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
+ * the current write size (l2arc_write_max) we should TRIM if we
+ * have filled the device. It is defined as a percentage of the
+ * write size. If set to 100 we trim twice the space required to
+ * accommodate upcoming writes. A minimum of 64MB will be trimmed.
+ * It also enables TRIM of the whole L2ARC device upon creation or
+ * addition to an existing pool or if the header of the device is
+ * invalid upon importing a pool or onlining a cache device. The
+ * default is 0, which disables TRIM on L2ARC altogether as it can
+ * put significant stress on the underlying storage devices. This
+ * will vary depending on how well the specific device handles
+ * these commands.
+ */
+unsigned long l2arc_trim_ahead = 0;
+
+/*
* Performance tuning of L2ARC persistence:
*
* l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
@@ -902,7 +919,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
@@ -7709,7 +7725,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size, dev_size;
+ uint64_t size, dev_size, tsize;
/*
* Make sure our globals have meaningful values in case the user
@@ -7732,7 +7748,12 @@ l2arc_write_size(l2arc_dev_t *dev)
* iteration can occur.
*/
dev_size = dev->l2ad_end - dev->l2ad_start;
- if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ tsize = size + l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
+
+ if (tsize >= dev_size) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
@@ -7810,10 +7831,12 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all)
next = NULL;
l2arc_dev_last = next;
@@ -8336,8 +8359,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
- boolean_t rerun;
l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ vdev_t *vd = dev->l2ad_vdev;
+ boolean_t rerun;
buflist = &dev->l2ad_buflist;
@@ -8345,6 +8369,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
* We need to add in the worst case scenario of log block overhead.
*/
distance += l2arc_log_blk_overhead(distance, dev);
+ if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ distance += MAX(64 * 1024 * 1024,
+ (distance * l2arc_trim_ahead) / 100);
+ }
top:
rerun = B_FALSE;
@@ -8365,25 +8397,51 @@ top:
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
- /*
- * This check has to be placed after deciding whether to iterate
- * (rerun).
- */
- if (!all && dev->l2ad_first) {
+ if (!all) {
/*
- * This is the first sweep through the device. There is
- * nothing to evict.
+ * This check has to be placed after deciding whether to
+ * iterate (rerun).
*/
- goto out;
- }
+ if (dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict. We have already trimmed the
+ * whole device.
+ */
+ goto out;
+ } else {
+ /*
+ * Trim the space to be evicted.
+ */
+ if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+ l2arc_trim_ahead > 0) {
+ /*
+ * We have to drop the spa_config lock because
+ * vdev_trim_range() will acquire it.
+ * l2ad_evict already accounts for the label
+ * size. To prevent vdev_trim_ranges() from
+ * adding it again, we subtract it from
+ * l2ad_evict.
+ */
+ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+ vdev_trim_simple(vd,
+ dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+ taddr - dev->l2ad_evict);
+ spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+ RW_READER);
+ }
- /*
- * When rebuilding L2ARC we retrieve the evict hand from the header of
- * the device. Of note, l2arc_evict() does not actually delete buffers
- * from the cache device, but keeping track of the evict hand will be
- * useful when TRIM is implemented.
- */
- dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand
+ * from the header of the device. Of note, l2arc_evict()
+ * does not actually delete buffers from the cache
+ * device, but trimming may do so depending on the
+ * hardware implementation. Thus keeping track of the
+ * evict hand is useful.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ }
+ }
retry:
mutex_enter(&dev->l2ad_mtx);
@@ -8410,7 +8468,7 @@ retry:
if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
break;
} else {
- vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+ vdev_space_update(vd, -asize, 0, 0);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
@@ -9015,7 +9073,7 @@ l2arc_vdev_present(vdev_t *vd)
* Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
* the vdev_t isn't an L2ARC device.
*/
-static l2arc_dev_t *
+l2arc_dev_t *
l2arc_vdev_get(vdev_t *vd)
{
l2arc_dev_t *dev;
@@ -9059,6 +9117,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_trim_all = B_FALSE;
list_link_init(&adddev->l2ad_node);
adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
@@ -9164,11 +9223,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
dev->l2ad_rebuild = B_TRUE;
} else if (spa_writeable(spa)) {
/*
- * In this case create a new header. We zero out the memory
- * holding the header to reset dh_start_lbps.
+ * In this case TRIM the whole device if l2arc_trim_ahead > 0,
+ * otherwise create a new header. We zero out the memory holding
+ * the header to reset dh_start_lbps. If we TRIM the whole
+ * device the new header will be written by
+ * vdev_trim_l2arc_thread() at the end of the TRIM to update the
+ * trim_state in the header too. When reading the header, if
+ * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
+ * we opt to TRIM the whole device again.
*/
- bzero(l2dhdr, l2dhdr_asize);
- l2arc_dev_hdr_update(dev);
+ if (l2arc_trim_ahead > 0) {
+ dev->l2ad_trim_all = B_TRUE;
+ } else {
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
}
@@ -9385,6 +9454,9 @@ l2arc_rebuild(l2arc_dev_t *dev)
dev->l2ad_start);
dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+ vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
+ vd->vdev_trim_state = l2dhdr->dh_trim_state;
+
/*
* In case the zfs module parameter l2arc_rebuild_enabled is false
* we do not start the rebuild process.
@@ -9594,7 +9666,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
l2dhdr->dh_end != dev->l2ad_end ||
!l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
- l2dhdr->dh_evict)) {
+ l2dhdr->dh_evict) ||
+ (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
+ l2arc_trim_ahead > 0)) {
/*
* Attempt to rebuild a device containing no actual dev hdr
* or containing a header from some other pool or from another
@@ -9903,7 +9977,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio)
* Creates a zio to update the device header on an l2arc device. The zio is
* initiated as a child of `pio'.
*/
-static void
+void
l2arc_dev_hdr_update(l2arc_dev_t *dev)
{
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
@@ -9924,6 +9998,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
l2dhdr->dh_flags = 0;
+ l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
+ l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
if (dev->l2ad_first)
l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
@@ -10260,6 +10336,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
"Compressed l2arc_headroom multiplier");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ "TRIM ahead L2ARC write size multiplier");
+
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
"Seconds between L2ARC writing");
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 87af3073a..ba9a0dce3 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1896,6 +1896,15 @@ spa_load_l2cache(spa_t *spa)
if (!vdev_is_dead(vd))
l2arc_add_vdev(spa, vd);
+
+ /*
+ * Upon cache device addition to a pool or pool
+ * creation with a cache device or if the header
+ * of the device is invalid we issue an async
+ * TRIM command for the whole device which will
+ * execute if l2arc_trim_ahead > 0.
+ */
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
}
}
@@ -7994,6 +8003,17 @@ spa_async_thread(void *arg)
}
/*
+ * Kick off L2 cache whole device TRIM.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_l2arc(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
* Kick off L2 cache rebuilding.
*/
if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 923bf2e33..83c39d119 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2281,9 +2281,6 @@ vdev_reopen(vdev_t *vd)
if (vdev_readable(vd) && vdev_writeable(vd) &&
vd->vdev_aux == &spa->spa_l2cache) {
/*
- * When reopening we can assume the device label has
- * already the attribute l2cache_persistent, since we've
- * opened the device in the past and updated the label.
* In case the vdev is present we should evict all ARC
* buffers and pointers to log blocks and reclaim their
* space before restoring its contents to L2ARC.
@@ -2294,6 +2291,7 @@ vdev_reopen(vdev_t *vd)
l2arc_add_vdev(spa, vd);
}
spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
}
} else {
(void) vdev_validate(vd);
@@ -3542,9 +3540,14 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
}
mutex_exit(&vd->vdev_initialize_lock);
- /* Restart trimming if necessary */
+ /*
+ * Restart trimming if necessary. We do not restart trimming for cache
+ * devices here. This is triggered by l2arc_rebuild_vdev()
+ * asynchronously for the whole device or in l2arc_evict() as it evicts
+ * space for upcoming writes.
+ */
mutex_enter(&vd->vdev_trim_lock);
- if (vdev_writeable(vd) &&
+ if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
vd->vdev_trim_thread == NULL &&
vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 3f4f9091f..56e420871 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -2224,6 +2224,20 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* Cache devices can always be removed.
*/
vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+
+ /*
+ * Stop trimming the cache device. We need to release the
+ * config lock to allow the syncing of TRIM transactions
+ * without releasing the spa_namespace_lock. The same
+ * strategy is employed in spa_vdev_remove_top().
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ txg = spa_vdev_config_enter(spa);
+
ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index b0cd40f68..0254c2904 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -34,6 +34,7 @@
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
+#include <sys/arc_impl.h>
/*
* TRIM is a feature which is used to notify a SSD that some previously
@@ -423,6 +424,35 @@ vdev_autotrim_cb(zio_t *zio)
}
/*
+ * The zio_done_func_t done callback for each TRIM issued via
+ * vdev_trim_simple(). It is responsible for updating the TRIM stats and
+ * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
+ * effort and are never reissued on failure.
+ */
+static void
+vdev_trim_simple_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+/*
* Returns the average trim rate in bytes/sec for the ta->trim_vdev.
*/
static uint64_t
@@ -441,6 +471,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
vdev_t *vd = ta->trim_vdev;
spa_t *spa = vd->vdev_spa;
+ void *cb;
mutex_enter(&vd->vdev_trim_io_lock);
@@ -459,8 +490,8 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
ta->trim_bytes_done += size;
/* Limit in flight trimming I/Os */
- while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >=
- zfs_trim_queue_limit) {
+ while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
+ vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
}
vd->vdev_trim_inflight[ta->trim_type]++;
@@ -505,10 +536,17 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
if (ta->trim_type == TRIM_TYPE_MANUAL)
vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ cb = vdev_trim_cb;
+ } else if (ta->trim_type == TRIM_TYPE_AUTO) {
+ cb = vdev_autotrim_cb;
+ } else {
+ cb = vdev_trim_simple_cb;
+ }
+
zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
- start, size, ta->trim_type == TRIM_TYPE_MANUAL ?
- vdev_trim_cb : vdev_autotrim_cb, NULL,
- ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags));
+ start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
+ ta->trim_flags));
/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
dmu_tx_commit(tx);
@@ -1016,6 +1054,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
spa_t *spa = vd->vdev_spa;
list_t vd_list;
+ vdev_t *vd_l2cache;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
@@ -1023,6 +1062,17 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
offsetof(vdev_t, vdev_trim_node));
vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
+
+ /*
+ * Iterate over cache devices and request that whole-device
+ * trimming be stopped in case we export the pool or remove the
+ * cache device prematurely.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
+ vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
+ }
+
vdev_trim_stop_wait(spa, &vd_list);
if (vd->vdev_spa->spa_sync_on) {
@@ -1437,6 +1487,189 @@ vdev_autotrim_restart(spa_t *spa)
vdev_autotrim(spa);
}
+static void
+vdev_trim_l2arc_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
+ physical_rs.rs_end = vd->vdev_trim_bytes_est =
+ vdev_get_min_asize(vd);
+
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ (void) vdev_trim_ranges(&ta);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL ||
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and
+ * must check to see if it needs to restart a trim. That thread
+ * will be holding the spa_config_lock which would prevent the
+ * txg_wait_synced from completing. Same strategy as in
+ * vdev_trim_thread().
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ /*
+ * Update the header of the cache device here, before
+ * broadcasting vdev_trim_cv which may lead to the removal
+ * of the device. The same applies for setting l2ad_trim_all to
+ * false.
+ */
+ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
+ RW_READER);
+ bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
+
+ vd->vdev_trim_thread = NULL;
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
+ dev->l2ad_trim_all = B_FALSE;
+
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
+ * to vd->vdev_trim_thread variable. This facilitates the management of
+ * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition
+ * to a pool or pool creation or when the header of the device is invalid.
+ */
+void
+vdev_trim_l2arc(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off TRIM threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+
+ if (dev == NULL || !dev->l2ad_trim_all) {
+ /*
+ * Don't attempt TRIM if the vdev is UNAVAIL or if the
+ * cache device was not marked for whole device TRIM
+ * (ie l2arc_trim_ahead = 0, or the L2ARC device header
+ * is valid with trim_state = VDEV_TRIM_COMPLETE and
+ * l2ad_log_entries > 0).
+ */
+ continue;
+ }
+
+ mutex_enter(&vd->vdev_trim_lock);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * A wrapper which calls vdev_trim_ranges(). It is intended to be called
+ * on leaf vdevs.
+ */
+int
+vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
+{
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+ int error;
+ physical_rs.rs_start = start;
+ physical_rs.rs_end = start + size;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_SIMPLE;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+
+ error = vdev_trim_ranges(&ta);
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ return (error);
+}
+
EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
@@ -1446,6 +1679,8 @@ EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
+EXPORT_SYMBOL(vdev_trim_l2arc);
+EXPORT_SYMBOL(vdev_trim_simple);
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,