summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeorge Amanakis <[email protected]>2020-06-09 13:15:08 -0400
committerGitHub <[email protected]>2020-06-09 10:15:08 -0700
commitb7654bd7940618f1b02835d565e04920c8c4403f (patch)
treea346410e1da29cedeb9663447a59abf2dd4844ec
parent32f26eaa70fe9e8aea79311123879f885f674d45 (diff)
Trim L2ARC
The l2arc_evict() function is responsible for evicting buffers which reference the next bytes of the L2ARC device to be overwritten. Teach this function to additionally TRIM that vdev space before it is overwritten if the device has been filled with data. This is done by vdev_trim_simple() which trims by issuing a new type of TRIM, TRIM_TYPE_SIMPLE. We also implement a "Trim Ahead" feature. It is a zfs module parameter, expressed in % of the current write size. This trims ahead of the current write size. A minimum of 64MB will be trimmed. The default is 0 which disables TRIM on L2ARC as it can put significant stress to underlying storage devices. To enable TRIM on L2ARC we set l2arc_trim_ahead > 0. We also implement TRIM of the whole cache device upon addition to a pool, pool creation or when the header of the device is invalid upon importing a pool or onlining a cache device. This is dependent on l2arc_trim_ahead > 0. TRIM of the whole device is done with TRIM_TYPE_MANUAL so that its status can be monitored by zpool status -t. We save the TRIM state for the whole device and the time of completion on-disk in the header, and restore these upon L2ARC rebuild so that zpool status -t can correctly report them. Whole device TRIM is done asynchronously so that the user can export of the pool or remove the cache device while it is trimming (ie if it is too slow). We do not TRIM the whole device if persistent L2ARC has been disabled by l2arc_rebuild_enabled = 0 because we may not want to lose all cached buffers (eg we may want to import the pool with l2arc_rebuild_enabled = 0 only once because of memory pressure). If persistent L2ARC has been disabled by setting the module parameter l2arc_rebuild_blocks_min_l2size to a value greater than the size of the cache device then the whole device is trimmed upon creation or import of a pool if l2arc_trim_ahead > 0. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Adam D. Moss <[email protected]> Signed-off-by: George Amanakis <[email protected]> Closes #9713 Closes #9789 Closes #10224
-rw-r--r--cmd/zdb/zdb.c6
-rw-r--r--include/sys/arc_impl.h14
-rw-r--r--include/sys/spa.h8
-rw-r--r--include/sys/vdev_impl.h2
-rw-r--r--include/sys/vdev_trim.h2
-rw-r--r--man/man5/zfs-module-parameters.522
-rw-r--r--man/man8/zpoolprops.82
-rw-r--r--module/os/linux/zfs/spa_stats.c15
-rw-r--r--module/zfs/arc.c141
-rw-r--r--module/zfs/spa.c20
-rw-r--r--module/zfs/vdev.c13
-rw-r--r--module/zfs/vdev_removal.c14
-rw-r--r--module/zfs/vdev_trim.c245
-rw-r--r--tests/runfiles/common.run2
-rw-r--r--tests/zfs-tests/include/tunables.cfg1
-rw-r--r--tests/zfs-tests/tests/functional/trim/Makefile.am3
-rw-r--r--tests/zfs-tests/tests/functional/trim/trim.kshlib8
-rwxr-xr-xtests/zfs-tests/tests/functional/trim/trim_l2arc.ksh106
18 files changed, 573 insertions, 51 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 00258799b..763a086ac 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -3707,8 +3707,12 @@ dump_l2arc_header(int fd)
(u_longlong_t)l2dhdr.dh_evict);
(void) printf(" lb_asize_refcount: %llu\n",
(u_longlong_t)l2dhdr.dh_lb_asize);
- (void) printf(" lb_count_refcount: %llu\n\n",
+ (void) printf(" lb_count_refcount: %llu\n",
(u_longlong_t)l2dhdr.dh_lb_count);
+ (void) printf(" trim_action_time: %llu\n",
+ (u_longlong_t)l2dhdr.dh_trim_action_time);
+ (void) printf(" trim_state: %llu\n\n",
+ (u_longlong_t)l2dhdr.dh_trim_state);
}
dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index e8c944ce8..5724db0a5 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -240,7 +240,14 @@ typedef struct l2arc_dev_hdr_phys {
*/
uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
- const uint64_t dh_pad[32]; /* pad to 512 bytes */
+ /*
+ * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
+ * display when the cache device was fully trimmed for the last
+ * time.
+ */
+ uint64_t dh_trim_action_time;
+ uint64_t dh_trim_state;
+ const uint64_t dh_pad[30]; /* pad to 512 bytes */
zio_eck_t dh_tail;
} l2arc_dev_hdr_phys_t;
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
@@ -399,6 +406,7 @@ typedef struct l2arc_dev {
* Number of log blocks present on the device.
*/
zfs_refcount_t l2ad_lb_count;
+ boolean_t l2ad_trim_all; /* TRIM whole device */
} l2arc_dev_t;
/*
@@ -902,6 +910,10 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
const l2arc_log_blkptr_t *lbp);
+/* used in vdev_trim.c */
+void l2arc_dev_hdr_update(l2arc_dev_t *dev);
+l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/spa.h b/include/sys/spa.h
index a03319e2b..5806dda41 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -745,6 +745,7 @@ typedef enum {
typedef enum trim_type {
TRIM_TYPE_MANUAL = 0,
TRIM_TYPE_AUTO = 1,
+ TRIM_TYPE_SIMPLE = 2
} trim_type_t;
/* state manipulation functions */
@@ -788,6 +789,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
#define SPA_ASYNC_L2CACHE_REBUILD 0x800
+#define SPA_ASYNC_L2CACHE_TRIM 0x1000
/*
* Controls the behavior of spa_vdev_remove().
@@ -940,6 +942,12 @@ typedef struct spa_iostats {
kstat_named_t autotrim_bytes_skipped;
kstat_named_t autotrim_extents_failed;
kstat_named_t autotrim_bytes_failed;
+ kstat_named_t simple_trim_extents_written;
+ kstat_named_t simple_trim_bytes_written;
+ kstat_named_t simple_trim_extents_skipped;
+ kstat_named_t simple_trim_bytes_skipped;
+ kstat_named_t simple_trim_extents_failed;
+ kstat_named_t simple_trim_bytes_failed;
} spa_iostats_t;
extern void spa_stats_init(spa_t *spa);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 96546ac35..56407a191 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -301,7 +301,7 @@ struct vdev {
uint64_t vdev_initialize_inflight;
kmutex_t vdev_trim_io_lock;
kcondvar_t vdev_trim_io_cv;
- uint64_t vdev_trim_inflight[2];
+ uint64_t vdev_trim_inflight[3];
/*
* Values stored in the config for an indirect or removing vdev.
diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h
index 1e5401766..16f4be2a4 100644
--- a/include/sys/vdev_trim.h
+++ b/include/sys/vdev_trim.h
@@ -44,6 +44,8 @@ extern void vdev_autotrim(spa_t *spa);
extern void vdev_autotrim_stop_all(spa_t *spa);
extern void vdev_autotrim_stop_wait(vdev_t *vd);
extern void vdev_autotrim_restart(spa_t *spa);
+extern int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size);
+extern void vdev_trim_l2arc(spa_t *spa);
#ifdef __cplusplus
}
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 40666c8f3..7ef82d9a0 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -194,7 +194,8 @@ Default value: \fB2\fR.
.ad
.RS 12n
Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being
-successfully compressed before writing. A value of 100 disables this feature.
+successfully compressed before writing. A value of \fB100\fR disables this
+feature.
.sp
Default value: \fB200\fR%.
.RE
@@ -202,6 +203,25 @@ Default value: \fB200\fR%.
.sp
.ne 2
.na
+\fBl2arc_trim_ahead\fR (ulong)
+.ad
+.RS 12n
+Trims ahead of the current write size (\fBl2arc_write_max\fR) on L2ARC devices
+by this percentage of write size if we have filled the device. If set to
+\fB100\fR we TRIM twice the space required to accommodate upcoming writes. A
+minimum of 64MB will be trimmed. It also enables TRIM of the whole L2ARC device
+upon creation or addition to an existing pool or if the header of the device is
+invalid upon importing a pool or onlining a cache device. A value of \fB0\fR
+disables TRIM on L2ARC altogether and is the default as it can put significant
+stress on the underlying storage devices. This will vary depending of how well
+the specific device handles these commands.
+.sp
+Default value: \fB0\fR%.
+.RE
+
+.sp
+.ne 2
+.na
\fBl2arc_noprefetch\fR (int)
.ad
.RS 12n
diff --git a/man/man8/zpoolprops.8 b/man/man8/zpoolprops.8
index f0522ef78..d85b6d436 100644
--- a/man/man8/zpoolprops.8
+++ b/man/man8/zpoolprops.8
@@ -238,6 +238,8 @@ this property is
Automatic TRIM does not immediately reclaim blocks after a free. Instead,
it will optimistically delay allowing smaller ranges to be aggregated in to
a few larger ones. These can then be issued more efficiently to the storage.
+TRIM on L2ARC devices is enabled by setting
+.Sy l2arc_trim_ahead > 0 .
.Pp
Be aware that automatic trimming of recently freed data blocks can put
significant stress on the underlying storage devices. This will vary
diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c
index eae9c3f22..2ec32da46 100644
--- a/module/os/linux/zfs/spa_stats.c
+++ b/module/os/linux/zfs/spa_stats.c
@@ -903,6 +903,12 @@ static spa_iostats_t spa_iostats_template = {
{ "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "autotrim_extents_failed", KSTAT_DATA_UINT64 },
{ "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
};
#define SPA_IOSTATS_ADD(stat, val) \
@@ -929,13 +935,20 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
- } else {
+ } else if (type == TRIM_TYPE_AUTO) {
SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+ } else {
+ SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
}
}
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 29da08a49..e7ad976af 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -301,6 +301,7 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
#include <cityhash.h>
+#include <sys/vdev_trim.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
-static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -865,6 +865,23 @@ static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
/*
+ * L2ARC TRIM
+ * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
+ * the current write size (l2arc_write_max) we should TRIM if we
+ * have filled the device. It is defined as a percentage of the
+ * write size. If set to 100 we trim twice the space required to
+ * accommodate upcoming writes. A minimum of 64MB will be trimmed.
+ * It also enables TRIM of the whole L2ARC device upon creation or
+ * addition to an existing pool or if the header of the device is
+ * invalid upon importing a pool or onlining a cache device. The
+ * default is 0, which disables TRIM on L2ARC altogether as it can
+ * put significant stress on the underlying storage devices. This
+ * will vary depending of how well the specific device handles
+ * these commands.
+ */
+unsigned long l2arc_trim_ahead = 0;
+
+/*
* Performance tuning of L2ARC persistence:
*
* l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
@@ -902,7 +919,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
@@ -7709,7 +7725,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size, dev_size;
+ uint64_t size, dev_size, tsize;
/*
* Make sure our globals have meaningful values in case the user
@@ -7732,7 +7748,12 @@ l2arc_write_size(l2arc_dev_t *dev)
* iteration can occur.
*/
dev_size = dev->l2ad_end - dev->l2ad_start;
- if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ tsize = size + l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
+
+ if (tsize >= dev_size) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
@@ -7810,10 +7831,12 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all)
next = NULL;
l2arc_dev_last = next;
@@ -8336,8 +8359,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
- boolean_t rerun;
l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ vdev_t *vd = dev->l2ad_vdev;
+ boolean_t rerun;
buflist = &dev->l2ad_buflist;
@@ -8345,6 +8369,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
* We need to add in the worst case scenario of log block overhead.
*/
distance += l2arc_log_blk_overhead(distance, dev);
+ if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ distance += MAX(64 * 1024 * 1024,
+ (distance * l2arc_trim_ahead) / 100);
+ }
top:
rerun = B_FALSE;
@@ -8365,25 +8397,51 @@ top:
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
- /*
- * This check has to be placed after deciding whether to iterate
- * (rerun).
- */
- if (!all && dev->l2ad_first) {
+ if (!all) {
/*
- * This is the first sweep through the device. There is
- * nothing to evict.
+ * This check has to be placed after deciding whether to
+ * iterate (rerun).
*/
- goto out;
- }
+ if (dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict. We have already trimmmed the
+ * whole device.
+ */
+ goto out;
+ } else {
+ /*
+ * Trim the space to be evicted.
+ */
+ if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+ l2arc_trim_ahead > 0) {
+ /*
+ * We have to drop the spa_config lock because
+ * vdev_trim_range() will acquire it.
+ * l2ad_evict already accounts for the label
+ * size. To prevent vdev_trim_ranges() from
+ * adding it again, we subtract it from
+ * l2ad_evict.
+ */
+ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+ vdev_trim_simple(vd,
+ dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+ taddr - dev->l2ad_evict);
+ spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+ RW_READER);
+ }
- /*
- * When rebuilding L2ARC we retrieve the evict hand from the header of
- * the device. Of note, l2arc_evict() does not actually delete buffers
- * from the cache device, but keeping track of the evict hand will be
- * useful when TRIM is implemented.
- */
- dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand
+ * from the header of the device. Of note, l2arc_evict()
+ * does not actually delete buffers from the cache
+ * device, but trimming may do so depending on the
+ * hardware implementation. Thus keeping track of the
+ * evict hand is useful.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ }
+ }
retry:
mutex_enter(&dev->l2ad_mtx);
@@ -8410,7 +8468,7 @@ retry:
if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
break;
} else {
- vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+ vdev_space_update(vd, -asize, 0, 0);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
@@ -9015,7 +9073,7 @@ l2arc_vdev_present(vdev_t *vd)
* Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
* the vdev_t isn't an L2ARC device.
*/
-static l2arc_dev_t *
+l2arc_dev_t *
l2arc_vdev_get(vdev_t *vd)
{
l2arc_dev_t *dev;
@@ -9059,6 +9117,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_trim_all = B_FALSE;
list_link_init(&adddev->l2ad_node);
adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
@@ -9164,11 +9223,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
dev->l2ad_rebuild = B_TRUE;
} else if (spa_writeable(spa)) {
/*
- * In this case create a new header. We zero out the memory
- * holding the header to reset dh_start_lbps.
+ * In this case TRIM the whole device if l2arc_trim_ahead > 0,
+ * otherwise create a new header. We zero out the memory holding
+ * the header to reset dh_start_lbps. If we TRIM the whole
+ * device the new header will be written by
+ * vdev_trim_l2arc_thread() at the end of the TRIM to update the
+ * trim_state in the header too. When reading the header, if
+ * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
+ * we opt to TRIM the whole device again.
*/
- bzero(l2dhdr, l2dhdr_asize);
- l2arc_dev_hdr_update(dev);
+ if (l2arc_trim_ahead > 0) {
+ dev->l2ad_trim_all = B_TRUE;
+ } else {
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
}
@@ -9385,6 +9454,9 @@ l2arc_rebuild(l2arc_dev_t *dev)
dev->l2ad_start);
dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+ vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
+ vd->vdev_trim_state = l2dhdr->dh_trim_state;
+
/*
* In case the zfs module parameter l2arc_rebuild_enabled is false
* we do not start the rebuild process.
@@ -9594,7 +9666,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
l2dhdr->dh_end != dev->l2ad_end ||
!l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
- l2dhdr->dh_evict)) {
+ l2dhdr->dh_evict) ||
+ (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
+ l2arc_trim_ahead > 0)) {
/*
* Attempt to rebuild a device containing no actual dev hdr
* or containing a header from some other pool or from another
@@ -9903,7 +9977,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio)
* Creates a zio to update the device header on an l2arc device. The zio is
* initiated as a child of `pio'.
*/
-static void
+void
l2arc_dev_hdr_update(l2arc_dev_t *dev)
{
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
@@ -9924,6 +9998,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
l2dhdr->dh_flags = 0;
+ l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
+ l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
if (dev->l2ad_first)
l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
@@ -10260,6 +10336,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
"Compressed l2arc_headroom multiplier");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ "TRIM ahead L2ARC write size multiplier");
+
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
"Seconds between L2ARC writing");
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 87af3073a..ba9a0dce3 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1896,6 +1896,15 @@ spa_load_l2cache(spa_t *spa)
if (!vdev_is_dead(vd))
l2arc_add_vdev(spa, vd);
+
+ /*
+ * Upon cache device addition to a pool or pool
+ * creation with a cache device or if the header
+ * of the device is invalid we issue an async
+ * TRIM command for the whole device which will
+ * execute if l2arc_trim_ahead > 0.
+ */
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
}
}
@@ -7994,6 +8003,17 @@ spa_async_thread(void *arg)
}
/*
+ * Kick off L2 cache whole device TRIM.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_l2arc(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
* Kick off L2 cache rebuilding.
*/
if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 923bf2e33..83c39d119 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2281,9 +2281,6 @@ vdev_reopen(vdev_t *vd)
if (vdev_readable(vd) && vdev_writeable(vd) &&
vd->vdev_aux == &spa->spa_l2cache) {
/*
- * When reopening we can assume the device label has
- * already the attribute l2cache_persistent, since we've
- * opened the device in the past and updated the label.
* In case the vdev is present we should evict all ARC
* buffers and pointers to log blocks and reclaim their
* space before restoring its contents to L2ARC.
@@ -2294,6 +2291,7 @@ vdev_reopen(vdev_t *vd)
l2arc_add_vdev(spa, vd);
}
spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
}
} else {
(void) vdev_validate(vd);
@@ -3542,9 +3540,14 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
}
mutex_exit(&vd->vdev_initialize_lock);
- /* Restart trimming if necessary */
+ /*
+ * Restart trimming if necessary. We do not restart trimming for cache
+ * devices here. This is triggered by l2arc_rebuild_vdev()
+ * asynchronously for the whole device or in l2arc_evict() as it evicts
+ * space for upcoming writes.
+ */
mutex_enter(&vd->vdev_trim_lock);
- if (vdev_writeable(vd) &&
+ if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
vd->vdev_trim_thread == NULL &&
vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 3f4f9091f..56e420871 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -2224,6 +2224,20 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* Cache devices can always be removed.
*/
vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+
+ /*
+ * Stop trimming the cache device. We need to release the
+ * config lock to allow the syncing of TRIM transactions
+ * without releasing the spa_namespace_lock. The same
+ * strategy is employed in spa_vdev_remove_top().
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ txg = spa_vdev_config_enter(spa);
+
ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index b0cd40f68..0254c2904 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -34,6 +34,7 @@
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
+#include <sys/arc_impl.h>
/*
* TRIM is a feature which is used to notify a SSD that some previously
@@ -423,6 +424,35 @@ vdev_autotrim_cb(zio_t *zio)
}
/*
+ * The zio_done_func_t done callback for each TRIM issued via
+ * vdev_trim_simple(). It is responsible for updating the TRIM stats and
+ * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
+ * effort and are never reissued on failure.
+ */
+static void
+vdev_trim_simple_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+/*
* Returns the average trim rate in bytes/sec for the ta->trim_vdev.
*/
static uint64_t
@@ -441,6 +471,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
vdev_t *vd = ta->trim_vdev;
spa_t *spa = vd->vdev_spa;
+ void *cb;
mutex_enter(&vd->vdev_trim_io_lock);
@@ -459,8 +490,8 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
ta->trim_bytes_done += size;
/* Limit in flight trimming I/Os */
- while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >=
- zfs_trim_queue_limit) {
+ while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
+ vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
}
vd->vdev_trim_inflight[ta->trim_type]++;
@@ -505,10 +536,17 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
if (ta->trim_type == TRIM_TYPE_MANUAL)
vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ cb = vdev_trim_cb;
+ } else if (ta->trim_type == TRIM_TYPE_AUTO) {
+ cb = vdev_autotrim_cb;
+ } else {
+ cb = vdev_trim_simple_cb;
+ }
+
zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
- start, size, ta->trim_type == TRIM_TYPE_MANUAL ?
- vdev_trim_cb : vdev_autotrim_cb, NULL,
- ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags));
+ start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
+ ta->trim_flags));
/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
dmu_tx_commit(tx);
@@ -1016,6 +1054,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
spa_t *spa = vd->vdev_spa;
list_t vd_list;
+ vdev_t *vd_l2cache;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
@@ -1023,6 +1062,17 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
offsetof(vdev_t, vdev_trim_node));
vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
+
+ /*
+ * Iterate over cache devices and request stop trimming the
+ * whole device in case we export the pool or remove the cache
+ * device prematurely.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
+ vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
+ }
+
vdev_trim_stop_wait(spa, &vd_list);
if (vd->vdev_spa->spa_sync_on) {
@@ -1437,6 +1487,189 @@ vdev_autotrim_restart(spa_t *spa)
vdev_autotrim(spa);
}
+static void
+vdev_trim_l2arc_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
+ physical_rs.rs_end = vd->vdev_trim_bytes_est =
+ vdev_get_min_asize(vd);
+
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ (void) vdev_trim_ranges(&ta);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL ||
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and
+ * must check to see if it needs to restart a trim. That thread
+ * will be holding the spa_config_lock which would prevent the
+ * txg_wait_synced from completing. Same strategy as in
+ * vdev_trim_thread().
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ /*
+ * Update the header of the cache device here, before
+ * broadcasting vdev_trim_cv which may lead to the removal
+ * of the device. The same applies for setting l2ad_trim_all to
+ * false.
+ */
+ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
+ RW_READER);
+ bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
+
+ vd->vdev_trim_thread = NULL;
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
+ dev->l2ad_trim_all = B_FALSE;
+
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
+ * to vd->vdev_trim_thread variable. This facilitates the management of
+ * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition
+ * to a pool or pool creation or when the header of the device is invalid.
+ */
+void
+vdev_trim_l2arc(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off TRIM threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+
+ if (dev == NULL || !dev->l2ad_trim_all) {
+ /*
+ * Don't attempt TRIM if the vdev is UNAVAIL or if the
+ * cache device was not marked for whole device TRIM
+ * (ie l2arc_trim_ahead = 0, or the L2ARC device header
+ * is valid with trim_state = VDEV_TRIM_COMPLETE and
+ * l2ad_log_entries > 0).
+ */
+ continue;
+ }
+
+ mutex_enter(&vd->vdev_trim_lock);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * A wrapper which calls vdev_trim_ranges(). It is intended to be called
+ * on leaf vdevs.
+ */
+int
+vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
+{
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+ int error;
+ physical_rs.rs_start = start;
+ physical_rs.rs_end = start + size;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_SIMPLE;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+
+ error = vdev_trim_ranges(&ta);
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ return (error);
+}
+
EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
@@ -1446,6 +1679,8 @@ EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
+EXPORT_SYMBOL(vdev_trim_l2arc);
+EXPORT_SYMBOL(vdev_trim_simple);
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 01bab0870..cbad90ad1 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -832,7 +832,7 @@ tags = ['functional', 'threadsappend']
[tests/functional/trim]
tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity',
- 'trim_integrity', 'trim_config']
+ 'trim_integrity', 'trim_config', 'trim_l2arc']
tags = ['functional', 'trim']
[tests/functional/truncate]
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index efbcc09e7..c450764db 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -38,6 +38,7 @@ KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps
L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch
L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size
L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled
+L2ARC_TRIM_AHEAD UNSUPPORTED l2arc_trim_ahead
L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost
L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max
LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc
diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am
index 4f260a8e4..8917ed726 100644
--- a/tests/zfs-tests/tests/functional/trim/Makefile.am
+++ b/tests/zfs-tests/tests/functional/trim/Makefile.am
@@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
autotrim_config.ksh \
autotrim_trim_integrity.ksh \
trim_integrity.ksh \
- trim_config.ksh
+ trim_config.ksh \
+ trim_l2arc.ksh
diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib
index 7f1bcdacf..bede946a0 100644
--- a/tests/zfs-tests/tests/functional/trim/trim.kshlib
+++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib
@@ -33,17 +33,18 @@ function get_trim_io
{
typeset pool="${1-:$TESTPOOL}"
typeset type="${2-:ind}"
+ typeset vdev="${3}"
typeset rval
# Sum the ind or agg columns of the trim request size histogram.
case "$type" in
"ind")
- rval=$(zpool iostat -pr $pool | awk \
+ rval=$(zpool iostat -pr $pool $vdev | awk \
'$1 ~ /[0-9].*/ { sum += $12 } END { print sum }')
echo -n "$rval"
;;
"agg")
- rval=$(zpool iostat -pr $pool | awk \
+ rval=$(zpool iostat -pr $pool $vdev | awk \
'$1 ~ /[0-9].*/ { sum += $13 } END { print sum }')
echo -n "$rval"
;;
@@ -61,9 +62,10 @@ function verify_trim_io
typeset pool="${1:-$TESTPOOL}"
typeset type="${2:-ind}"
typeset min_trim_ios=${3:-100}
+ typeset vdev="${4}"
typeset ios
- ios=$(get_trim_io $pool $type)
+ ios=$(get_trim_io $pool $type $vdev)
if [[ $ios -ge $min_trim_ios ]]; then
log_note "Issued $ios $type trim IOs for pool $pool"
else
diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh
new file mode 100755
index 000000000..ecf9f3424
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+# Verify trimming of L2ARC
+#
+# STRATEGY:
+# 1. Set 'l2arc_trim_ahead = 1' and `l2arc_write_size = 64MB`.
+# 2. Create a pool on file vdevs to trim.
+# 3. Verify the cache device was trimmed.
+# 4. Fill the pool with a file larger than the L2ARC vdev.
+# 5. Randomly read the previous written file long enough for the
+# L2ARC vdev to be filled and overwritten 5 times.
+# 6. Verify trim IOs of the expected type were issued for the pool.
+# 7. Verify the allocated space on the cache device is less than
+# its size.
+#
+
+verify_runnable "global"
+
+log_assert "Trim of L2ARC succeeds."
+
+function cleanup
+{
+ if poolexists $TESTPOOL; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must rm -f $VDEVS
+ log_must set_tunable32 L2ARC_TRIM_AHEAD $l2arc_trimahead
+ log_must set_tunable32 L2ARC_WRITE_MAX $l2arc_writemax
+}
+log_onexit cleanup
+
+# The cache device $TRIM_VDEV2 has to be small enough, so that
+# dev->l2ad_hand loops around and dev->l2ad_first=0. Otherwise
+# l2arc_evict() exits before evicting/trimming.
+typeset l2arc_trimahead=$(get_tunable L2ARC_TRIM_AHEAD)
+typeset l2arc_writemax=$(get_tunable L2ARC_WRITE_MAX)
+log_must set_tunable32 L2ARC_TRIM_AHEAD 1
+log_must set_tunable32 L2ARC_WRITE_MAX $((64 * 1024 * 1024))
+VDEVS="$TRIM_VDEV1 $TRIM_VDEV2"
+log_must truncate -s $((MINVDEVSIZE)) $TRIM_VDEV2
+log_must truncate -s $((4 * MINVDEVSIZE)) $TRIM_VDEV1
+typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024))
+
+log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2
+verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2
+
+typeset fill_mb=$(( floor(2 * MINVDEVSIZE) ))
+export DIRECTORY=/$TESTPOOL
+export NUMJOBS=1
+export FILE_SIZE=${fill_mb}
+export PERF_RANDSEED=1234
+export PERF_COMPPERCENT=66
+export PERF_COMPCHUNK=0
+export RUNTIME=30
+export BLOCKSIZE=128K
+export SYNC_TYPE=0
+export DIRECT=1
+
+# Write to the pool.
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+
+# Read randomly from the pool to fill L2ARC.
+export RUNTIME=30
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+export RUNTIME=1
+typeset do_once=true
+while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
+ typeset l2_size1=$(get_arcstat l2_size)
+ log_must fio $FIO_SCRIPTS/random_reads.fio
+ typeset l2_size2=$(get_arcstat l2_size)
+ do_once=false
+done
+
+verify_trim_io $TESTPOOL "ind" 5 $TRIM_VDEV2
+
+typeset cache_size=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $2}')
+typeset cache_alloc=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $3}')
+
+log_must test $cache_alloc -lt $cache_size
+
+log_must zpool destroy $TESTPOOL
+log_must rm -f $VDEVS
+
+log_pass "Trim of L2ARC succeeds."