diff options
Diffstat (limited to 'module')
-rw-r--r-- | module/os/linux/zfs/spa_stats.c | 15 | ||||
-rw-r--r-- | module/zfs/arc.c | 141 | ||||
-rw-r--r-- | module/zfs/spa.c | 20 | ||||
-rw-r--r-- | module/zfs/vdev.c | 13 | ||||
-rw-r--r-- | module/zfs/vdev_removal.c | 14 | ||||
-rw-r--r-- | module/zfs/vdev_trim.c | 245 |
6 files changed, 406 insertions, 42 deletions
diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c index eae9c3f22..2ec32da46 100644 --- a/module/os/linux/zfs/spa_stats.c +++ b/module/os/linux/zfs/spa_stats.c @@ -903,6 +903,12 @@ static spa_iostats_t spa_iostats_template = { { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ @@ -929,13 +935,20 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type, SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); - } else { + } else if (type == TRIM_TYPE_AUTO) { SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); + } else { + SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); + SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); } } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 29da08a49..e7ad976af 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -301,6 +301,7 @@ #include <sys/trace_zfs.h> #include <sys/aggsum.h> #include 
<cityhash.h> +#include <sys/vdev_trim.h> #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); -static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -865,6 +865,23 @@ static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); /* + * L2ARC TRIM + * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of + * the current write size (l2arc_write_max) we should TRIM if we + * have filled the device. It is defined as a percentage of the + * write size. If set to 100 we trim twice the space required to + * accommodate upcoming writes. A minimum of 64MB will be trimmed. + * It also enables TRIM of the whole L2ARC device upon creation or + * addition to an existing pool or if the header of the device is + * invalid upon importing a pool or onlining a cache device. The + * default is 0, which disables TRIM on L2ARC altogether as it can + * put significant stress on the underlying storage devices. This + * will vary depending on how well the specific device handles + * these commands. + */ +unsigned long l2arc_trim_ahead = 0; + +/* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding @@ -902,7 +919,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. 
*/ -static void l2arc_dev_hdr_update(l2arc_dev_t *dev); static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); @@ -7709,7 +7725,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size; + uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user @@ -7732,7 +7748,12 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; - if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { + tsize = size + l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); + + if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -7810,10 +7831,12 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; @@ -8336,8 +8359,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - boolean_t rerun; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + vdev_t *vd = dev->l2ad_vdev; + boolean_t rerun; buflist = &dev->l2ad_buflist; @@ -8345,6 +8369,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * We need to add in the worst case scenario of log block overhead. 
*/ distance += l2arc_log_blk_overhead(distance, dev); + if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the write size, whichever is greater. + */ + distance += MAX(64 * 1024 * 1024, + (distance * l2arc_trim_ahead) / 100); + } top: rerun = B_FALSE; @@ -8365,25 +8397,51 @@ top: DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); - /* - * This check has to be placed after deciding whether to iterate - * (rerun). - */ - if (!all && dev->l2ad_first) { + if (!all) { /* - * This is the first sweep through the device. There is - * nothing to evict. + * This check has to be placed after deciding whether to + * iterate (rerun). */ - goto out; - } + if (dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. We have already trimmed the + * whole device. + */ + goto out; + } else { + /* + * Trim the space to be evicted. + */ + if (vd->vdev_has_trim && dev->l2ad_evict < taddr && + l2arc_trim_ahead > 0) { + /* + * We have to drop the spa_config lock because + * vdev_trim_range() will acquire it. + * l2ad_evict already accounts for the label + * size. To prevent vdev_trim_ranges() from + * adding it again, we subtract it from + * l2ad_evict. + */ + spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); + vdev_trim_simple(vd, + dev->l2ad_evict - VDEV_LABEL_START_SIZE, + taddr - dev->l2ad_evict); + spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, + RW_READER); + } - /* - * When rebuilding L2ARC we retrieve the evict hand from the header of - * the device. Of note, l2arc_evict() does not actually delete buffers - * from the cache device, but keeping track of the evict hand will be - * useful when TRIM is implemented. - */ - dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + /* + * When rebuilding L2ARC we retrieve the evict hand + * from the header of the device. 
Of note, l2arc_evict() + * does not actually delete buffers from the cache + * device, but trimming may do so depending on the + * hardware implementation. Thus keeping track of the + * evict hand is useful. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + } + } retry: mutex_enter(&dev->l2ad_mtx); @@ -8410,7 +8468,7 @@ retry: if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, @@ -9015,7 +9073,7 @@ l2arc_vdev_present(vdev_t *vd) * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ -static l2arc_dev_t * +l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; @@ -9059,6 +9117,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); @@ -9164,11 +9223,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* - * In this case create a new header. We zero out the memory - * holding the header to reset dh_start_lbps. + * In this case TRIM the whole device if l2arc_trim_ahead > 0, + * otherwise create a new header. We zero out the memory holding + * the header to reset dh_start_lbps. If we TRIM the whole + * device the new header will be written by + * vdev_trim_l2arc_thread() at the end of the TRIM to update the + * trim_state in the header too. When reading the header, if + * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 + * we opt to TRIM the whole device again. 
*/ - bzero(l2dhdr, l2dhdr_asize); - l2arc_dev_hdr_update(dev); + if (l2arc_trim_ahead > 0) { + dev->l2ad_trim_all = B_TRUE; + } else { + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } } @@ -9385,6 +9454,9 @@ l2arc_rebuild(l2arc_dev_t *dev) dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; + vd->vdev_trim_state = l2dhdr->dh_trim_state; + /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. @@ -9594,7 +9666,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, - l2dhdr->dh_evict)) { + l2dhdr->dh_evict) || + (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && + l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another @@ -9903,7 +9977,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio) * Creates a zio to update the device header on an l2arc device. The zio is * initiated as a child of `pio'. 
*/ -static void +void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; @@ -9924,6 +9998,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; + l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; + l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -10260,6 +10336,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, + "TRIM ahead L2ARC write size multiplier"); + ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 87af3073a..ba9a0dce3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1896,6 +1896,15 @@ spa_load_l2cache(spa_t *spa) if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); + + /* + * Upon cache device addition to a pool or pool + * creation with a cache device or if the header + * of the device is invalid we issue an async + * TRIM command for the whole device which will + * execute if l2arc_trim_ahead > 0. + */ + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } @@ -7994,6 +8003,17 @@ spa_async_thread(void *arg) } /* + * Kick off L2 cache whole device TRIM. + */ + if (tasks & SPA_ASYNC_L2CACHE_TRIM) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_l2arc(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* * Kick off L2 cache rebuilding. 
*/ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 923bf2e33..83c39d119 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2281,9 +2281,6 @@ vdev_reopen(vdev_t *vd) if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* - * When reopening we can assume the device label has - * already the attribute l2cache_persistent, since we've - * opened the device in the past and updated the label. * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. @@ -2294,6 +2291,7 @@ vdev_reopen(vdev_t *vd) l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); @@ -3542,9 +3540,14 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) } mutex_exit(&vd->vdev_initialize_lock); - /* Restart trimming if necessary */ + /* + * Restart trimming if necessary. We do not restart trimming for cache + * devices here. This is triggered by l2arc_rebuild_vdev() + * asynchronously for the whole device or in l2arc_evict() as it evicts + * space for upcoming writes. + */ mutex_enter(&vd->vdev_trim_lock); - if (vdev_writeable(vd) && + if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 3f4f9091f..56e420871 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -2224,6 +2224,20 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); + + /* + * Stop trimming the cache device. 
We need to release the + * config lock to allow the syncing of TRIM transactions + * without releasing the spa_namespace_lock. The same + * strategy is employed in spa_vdev_remove_top(). + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); + mutex_exit(&vd->vdev_trim_lock); + txg = spa_vdev_config_enter(spa); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index b0cd40f68..0254c2904 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -34,6 +34,7 @@ #include <sys/dsl_synctask.h> #include <sys/zap.h> #include <sys/dmu_tx.h> +#include <sys/arc_impl.h> /* * TRIM is a feature which is used to notify a SSD that some previously @@ -423,6 +424,35 @@ vdev_autotrim_cb(zio_t *zio) } /* + * The zio_done_func_t done callback for each TRIM issued via + * vdev_trim_simple(). It is responsible for updating the TRIM stats and + * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best + * effort and are never reissued on failure. + */ +static void +vdev_trim_simple_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} +/* * Returns the average trim rate in bytes/sec for the ta->trim_vdev. 
*/ static uint64_t @@ -441,6 +471,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) { vdev_t *vd = ta->trim_vdev; spa_t *spa = vd->vdev_spa; + void *cb; mutex_enter(&vd->vdev_trim_io_lock); @@ -459,8 +490,8 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) ta->trim_bytes_done += size; /* Limit in flight trimming I/Os */ - while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >= - zfs_trim_queue_limit) { + while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] + + vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } vd->vdev_trim_inflight[ta->trim_type]++; @@ -505,10 +536,17 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) if (ta->trim_type == TRIM_TYPE_MANUAL) vd->vdev_trim_offset[txg & TXG_MASK] = start + size; + if (ta->trim_type == TRIM_TYPE_MANUAL) { + cb = vdev_trim_cb; + } else if (ta->trim_type == TRIM_TYPE_AUTO) { + cb = vdev_autotrim_cb; + } else { + cb = vdev_trim_simple_cb; + } + zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd, - start, size, ta->trim_type == TRIM_TYPE_MANUAL ? - vdev_trim_cb : vdev_autotrim_cb, NULL, - ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags)); + start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, + ta->trim_flags)); /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */ dmu_tx_commit(tx); @@ -1016,6 +1054,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; + vdev_t *vd_l2cache; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1023,6 +1062,17 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) offsetof(vdev_t, vdev_trim_node)); vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); + + /* + * Iterate over cache devices and request stop trimming the + * whole device in case we export the pool or remove the cache + * device prematurely. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; + vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); + } + vdev_trim_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { @@ -1437,6 +1487,189 @@ vdev_autotrim_restart(spa_t *spa) vdev_autotrim(spa); } +static void +vdev_trim_l2arc_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + trim_args_t ta; + range_seg64_t physical_rs; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_trim_last_offset = 0; + vd->vdev_trim_rate = 0; + vd->vdev_trim_partial = 0; + vd->vdev_trim_secure = 0; + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_MANUAL; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + physical_rs.rs_start = vd->vdev_trim_bytes_done = 0; + physical_rs.rs_end = vd->vdev_trim_bytes_est = + vdev_get_min_asize(vd); + + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + mutex_exit(&vd->vdev_trim_lock); + + (void) vdev_trim_ranges(&ta); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + mutex_enter(&vd->vdev_trim_lock); + if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + ASSERT(vd->vdev_trim_thread != NULL || + 
vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0); + + /* + * Drop the vdev_trim_lock while we sync out the txg since it's + * possible that a device might be trying to come online and + * must check to see if it needs to restart a trim. That thread + * will be holding the spa_config_lock which would prevent the + * txg_wait_synced from completing. Same strategy as in + * vdev_trim_thread(). + */ + mutex_exit(&vd->vdev_trim_lock); + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + mutex_enter(&vd->vdev_trim_lock); + + /* + * Update the header of the cache device here, before + * broadcasting vdev_trim_cv which may lead to the removal + * of the device. The same applies for setting l2ad_trim_all to + * false. + */ + spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, + RW_READER); + bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); + + vd->vdev_trim_thread = NULL; + if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE) + dev->l2ad_trim_all = B_FALSE; + + cv_broadcast(&vd->vdev_trim_cv); + mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); +} + +/* + * Punches out TRIM threads for the L2ARC devices in a spa and assigns them + * to vd->vdev_trim_thread variable. This facilitates the management of + * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition + * to a pool or pool creation or when the header of the device is invalid. + */ +void +vdev_trim_l2arc(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off TRIM threads. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_t *vd = spa->spa_l2cache.sav_vdevs[i]; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + + if (dev == NULL || !dev->l2ad_trim_all) { + /* + * Don't attempt TRIM if the vdev is UNAVAIL or if the + * cache device was not marked for whole device TRIM + * (i.e. l2arc_trim_ahead = 0, or the L2ARC device header + * is valid with trim_state = VDEV_TRIM_COMPLETE and + * l2ad_log_entries > 0). + */ + continue; + } + + mutex_enter(&vd->vdev_trim_lock); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_trim_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + vd->vdev_trim_thread = thread_create(NULL, 0, + vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&vd->vdev_trim_lock); + } +} + +/* + * A wrapper which calls vdev_trim_ranges(). It is intended to be called + * on leaf vdevs. 
+ */ +int +vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) +{ + trim_args_t ta; + range_seg64_t physical_rs; + int error; + physical_rs.rs_start = start; + physical_rs.rs_end = start + size; + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_top->vdev_removing); + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_SIMPLE; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } + + error = vdev_trim_ranges(&ta); + + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + return (error); +} + EXPORT_SYMBOL(vdev_trim); EXPORT_SYMBOL(vdev_trim_stop); EXPORT_SYMBOL(vdev_trim_stop_all); @@ -1446,6 +1679,8 @@ EXPORT_SYMBOL(vdev_autotrim); EXPORT_SYMBOL(vdev_autotrim_stop_all); EXPORT_SYMBOL(vdev_autotrim_stop_wait); EXPORT_SYMBOL(vdev_autotrim_restart); +EXPORT_SYMBOL(vdev_trim_l2arc); +EXPORT_SYMBOL(vdev_trim_simple); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, |