From b7654bd7940618f1b02835d565e04920c8c4403f Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 9 Jun 2020 13:15:08 -0400 Subject: Trim L2ARC The l2arc_evict() function is responsible for evicting buffers which reference the next bytes of the L2ARC device to be overwritten. Teach this function to additionally TRIM that vdev space before it is overwritten if the device has been filled with data. This is done by vdev_trim_simple() which trims by issuing a new type of TRIM, TRIM_TYPE_SIMPLE. We also implement a "Trim Ahead" feature. It is a zfs module parameter, expressed in % of the current write size. This trims ahead of the current write size. A minimum of 64MB will be trimmed. The default is 0 which disables TRIM on L2ARC as it can put significant stress to underlying storage devices. To enable TRIM on L2ARC we set l2arc_trim_ahead > 0. We also implement TRIM of the whole cache device upon addition to a pool, pool creation or when the header of the device is invalid upon importing a pool or onlining a cache device. This is dependent on l2arc_trim_ahead > 0. TRIM of the whole device is done with TRIM_TYPE_MANUAL so that its status can be monitored by zpool status -t. We save the TRIM state for the whole device and the time of completion on-disk in the header, and restore these upon L2ARC rebuild so that zpool status -t can correctly report them. Whole device TRIM is done asynchronously so that the user can export of the pool or remove the cache device while it is trimming (ie if it is too slow). We do not TRIM the whole device if persistent L2ARC has been disabled by l2arc_rebuild_enabled = 0 because we may not want to lose all cached buffers (eg we may want to import the pool with l2arc_rebuild_enabled = 0 only once because of memory pressure). If persistent L2ARC has been disabled by setting the module parameter l2arc_rebuild_blocks_min_l2size to a value greater than the size of the cache device then the whole device is trimmed upon creation or import of a pool if l2arc_trim_ahead > 0. Reviewed-by: Brian Behlendorf Reviewed-by: Adam D. Moss Signed-off-by: George Amanakis Closes #9713 Closes #9789 Closes #10224 --- module/zfs/arc.c | 141 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 110 insertions(+), 31 deletions(-) (limited to 'module/zfs/arc.c') diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 29da08a49..e7ad976af 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -301,6 +301,7 @@ #include #include #include +#include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); -static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -864,6 +864,23 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +/* + * L2ARC TRIM + * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of + * the current write size (l2arc_write_max) we should TRIM if we + * have filled the device. It is defined as a percentage of the + * write size. If set to 100 we trim twice the space required to + * accommodate upcoming writes. A minimum of 64MB will be trimmed. + * It also enables TRIM of the whole L2ARC device upon creation or + * addition to an existing pool or if the header of the device is + * invalid upon importing a pool or onlining a cache device. The + * default is 0, which disables TRIM on L2ARC altogether as it can + * put significant stress on the underlying storage devices. This + * will vary depending of how well the specific device handles + * these commands. + */ +unsigned long l2arc_trim_ahead = 0; + /* * Performance tuning of L2ARC persistence: * @@ -902,7 +919,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_dev_hdr_update(l2arc_dev_t *dev); static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); @@ -7709,7 +7725,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size; + uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user @@ -7732,7 +7748,12 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; - if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { + tsize = size + l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); + + if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -7810,10 +7831,12 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; @@ -8336,8 +8359,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - boolean_t rerun; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + vdev_t *vd = dev->l2ad_vdev; + boolean_t rerun; buflist = &dev->l2ad_buflist; @@ -8345,6 +8369,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); + if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the write size, whichever is greater. + */ + distance += MAX(64 * 1024 * 1024, + (distance * l2arc_trim_ahead) / 100); + } top: rerun = B_FALSE; @@ -8365,25 +8397,51 @@ top: DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); - /* - * This check has to be placed after deciding whether to iterate - * (rerun). - */ - if (!all && dev->l2ad_first) { + if (!all) { /* - * This is the first sweep through the device. There is - * nothing to evict. + * This check has to be placed after deciding whether to + * iterate (rerun). */ - goto out; - } + if (dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. We have already trimmmed the + * whole device. + */ + goto out; + } else { + /* + * Trim the space to be evicted. + */ + if (vd->vdev_has_trim && dev->l2ad_evict < taddr && + l2arc_trim_ahead > 0) { + /* + * We have to drop the spa_config lock because + * vdev_trim_range() will acquire it. + * l2ad_evict already accounts for the label + * size. To prevent vdev_trim_ranges() from + * adding it again, we subtract it from + * l2ad_evict. + */ + spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); + vdev_trim_simple(vd, + dev->l2ad_evict - VDEV_LABEL_START_SIZE, + taddr - dev->l2ad_evict); + spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, + RW_READER); + } - /* - * When rebuilding L2ARC we retrieve the evict hand from the header of - * the device. Of note, l2arc_evict() does not actually delete buffers - * from the cache device, but keeping track of the evict hand will be - * useful when TRIM is implemented. - */ - dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + /* + * When rebuilding L2ARC we retrieve the evict hand + * from the header of the device. Of note, l2arc_evict() + * does not actually delete buffers from the cache + * device, but trimming may do so depending on the + * hardware implementation. Thus keeping track of the + * evict hand is useful. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + } + } retry: mutex_enter(&dev->l2ad_mtx); @@ -8410,7 +8468,7 @@ retry: if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, @@ -9015,7 +9073,7 @@ l2arc_vdev_present(vdev_t *vd) * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ -static l2arc_dev_t * +l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; @@ -9059,6 +9117,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); @@ -9164,11 +9223,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* - * In this case create a new header. We zero out the memory - * holding the header to reset dh_start_lbps. + * In this case TRIM the whole device if l2arc_trim_ahead > 0, + * otherwise create a new header. We zero out the memory holding + * the header to reset dh_start_lbps. If we TRIM the whole + * device the new header will be written by + * vdev_trim_l2arc_thread() at the end of the TRIM to update the + * trim_state in the header too. When reading the header, if + * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 + * we opt to TRIM the whole device again. */ - bzero(l2dhdr, l2dhdr_asize); - l2arc_dev_hdr_update(dev); + if (l2arc_trim_ahead > 0) { + dev->l2ad_trim_all = B_TRUE; + } else { + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } } @@ -9385,6 +9454,9 @@ l2arc_rebuild(l2arc_dev_t *dev) dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; + vd->vdev_trim_state = l2dhdr->dh_trim_state; + /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. @@ -9594,7 +9666,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, - l2dhdr->dh_evict)) { + l2dhdr->dh_evict) || + (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && + l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another @@ -9903,7 +9977,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio) * Creates a zio to update the device header on an l2arc device. The zio is * initiated as a child of `pio'. */ -static void +void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; @@ -9924,6 +9998,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; + l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; + l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -10260,6 +10336,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, + "TRIM ahead L2ARC write size multiplier"); + ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); -- cgit v1.2.3