summaryrefslogtreecommitdiffstats
path: root/module/zfs/arc.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--module/zfs/arc.c141
1 files changed, 110 insertions, 31 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 29da08a49..e7ad976af 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -301,6 +301,7 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
#include <cityhash.h>
+#include <sys/vdev_trim.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
-static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -865,6 +865,23 @@ static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
/*
+ * L2ARC TRIM
+ * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
+ * the current write size (l2arc_write_max) we should TRIM if we
+ * have filled the device. It is defined as a percentage of the
+ * write size. If set to 100 we trim twice the space required to
+ * accommodate upcoming writes. A minimum of 64MB will be trimmed.
+ * It also enables TRIM of the whole L2ARC device upon creation or
+ * addition to an existing pool or if the header of the device is
+ * invalid upon importing a pool or onlining a cache device. The
+ * default is 0, which disables TRIM on L2ARC altogether as it can
+ * put significant stress on the underlying storage devices. This
+ * will vary depending of how well the specific device handles
+ * these commands.
+ */
+unsigned long l2arc_trim_ahead = 0;
+
+/*
* Performance tuning of L2ARC persistence:
*
* l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
@@ -902,7 +919,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
@@ -7709,7 +7725,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size, dev_size;
+ uint64_t size, dev_size, tsize;
/*
* Make sure our globals have meaningful values in case the user
@@ -7732,7 +7748,12 @@ l2arc_write_size(l2arc_dev_t *dev)
* iteration can occur.
*/
dev_size = dev->l2ad_end - dev->l2ad_start;
- if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ tsize = size + l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
+
+ if (tsize >= dev_size) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
@@ -7810,10 +7831,12 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all)
next = NULL;
l2arc_dev_last = next;
@@ -8336,8 +8359,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
- boolean_t rerun;
l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ vdev_t *vd = dev->l2ad_vdev;
+ boolean_t rerun;
buflist = &dev->l2ad_buflist;
@@ -8345,6 +8369,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
* We need to add in the worst case scenario of log block overhead.
*/
distance += l2arc_log_blk_overhead(distance, dev);
+ if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ distance += MAX(64 * 1024 * 1024,
+ (distance * l2arc_trim_ahead) / 100);
+ }
top:
rerun = B_FALSE;
@@ -8365,25 +8397,51 @@ top:
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
- /*
- * This check has to be placed after deciding whether to iterate
- * (rerun).
- */
- if (!all && dev->l2ad_first) {
+ if (!all) {
/*
- * This is the first sweep through the device. There is
- * nothing to evict.
+ * This check has to be placed after deciding whether to
+ * iterate (rerun).
*/
- goto out;
- }
+ if (dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict. We have already trimmmed the
+ * whole device.
+ */
+ goto out;
+ } else {
+ /*
+ * Trim the space to be evicted.
+ */
+ if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+ l2arc_trim_ahead > 0) {
+ /*
+ * We have to drop the spa_config lock because
+ * vdev_trim_range() will acquire it.
+ * l2ad_evict already accounts for the label
+ * size. To prevent vdev_trim_ranges() from
+ * adding it again, we subtract it from
+ * l2ad_evict.
+ */
+ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+ vdev_trim_simple(vd,
+ dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+ taddr - dev->l2ad_evict);
+ spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+ RW_READER);
+ }
- /*
- * When rebuilding L2ARC we retrieve the evict hand from the header of
- * the device. Of note, l2arc_evict() does not actually delete buffers
- * from the cache device, but keeping track of the evict hand will be
- * useful when TRIM is implemented.
- */
- dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand
+ * from the header of the device. Of note, l2arc_evict()
+ * does not actually delete buffers from the cache
+ * device, but trimming may do so depending on the
+ * hardware implementation. Thus keeping track of the
+ * evict hand is useful.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ }
+ }
retry:
mutex_enter(&dev->l2ad_mtx);
@@ -8410,7 +8468,7 @@ retry:
if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
break;
} else {
- vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+ vdev_space_update(vd, -asize, 0, 0);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
@@ -9015,7 +9073,7 @@ l2arc_vdev_present(vdev_t *vd)
* Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
* the vdev_t isn't an L2ARC device.
*/
-static l2arc_dev_t *
+l2arc_dev_t *
l2arc_vdev_get(vdev_t *vd)
{
l2arc_dev_t *dev;
@@ -9059,6 +9117,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_trim_all = B_FALSE;
list_link_init(&adddev->l2ad_node);
adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
@@ -9164,11 +9223,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
dev->l2ad_rebuild = B_TRUE;
} else if (spa_writeable(spa)) {
/*
- * In this case create a new header. We zero out the memory
- * holding the header to reset dh_start_lbps.
+ * In this case TRIM the whole device if l2arc_trim_ahead > 0,
+ * otherwise create a new header. We zero out the memory holding
+ * the header to reset dh_start_lbps. If we TRIM the whole
+ * device the new header will be written by
+ * vdev_trim_l2arc_thread() at the end of the TRIM to update the
+ * trim_state in the header too. When reading the header, if
+ * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
+ * we opt to TRIM the whole device again.
*/
- bzero(l2dhdr, l2dhdr_asize);
- l2arc_dev_hdr_update(dev);
+ if (l2arc_trim_ahead > 0) {
+ dev->l2ad_trim_all = B_TRUE;
+ } else {
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
}
@@ -9385,6 +9454,9 @@ l2arc_rebuild(l2arc_dev_t *dev)
dev->l2ad_start);
dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+ vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
+ vd->vdev_trim_state = l2dhdr->dh_trim_state;
+
/*
* In case the zfs module parameter l2arc_rebuild_enabled is false
* we do not start the rebuild process.
@@ -9594,7 +9666,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
l2dhdr->dh_end != dev->l2ad_end ||
!l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
- l2dhdr->dh_evict)) {
+ l2dhdr->dh_evict) ||
+ (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
+ l2arc_trim_ahead > 0)) {
/*
* Attempt to rebuild a device containing no actual dev hdr
* or containing a header from some other pool or from another
@@ -9903,7 +9977,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio)
* Creates a zio to update the device header on an l2arc device. The zio is
* initiated as a child of `pio'.
*/
-static void
+void
l2arc_dev_hdr_update(l2arc_dev_t *dev)
{
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
@@ -9924,6 +9998,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
l2dhdr->dh_flags = 0;
+ l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
+ l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
if (dev->l2ad_first)
l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
@@ -10260,6 +10336,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
"Compressed l2arc_headroom multiplier");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ "TRIM ahead L2ARC write size multiplier");
+
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
"Seconds between L2ARC writing");