Diffstat (limited to 'module/zfs/arc.c')
module/zfs/arc.c | 1430
1 file changed, 1384 insertions(+), 46 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b5d17431c..74bfbfc70 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,10 +21,11 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. - * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2014, Saso Kiselkov. All rights reserved. + * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. */ /* @@ -528,6 +529,20 @@ arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, @@ -582,6 +597,24 @@ arc_stats_t arc_stats = { } \ } +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain).
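+ * As a worked example (using the ARCSTAT_F_AVG_FACTOR of 3 defined just + * below): if the kstat currently reads 900 and the new sample is 300, the + * update stores 900 - 900/3 + 300/3 = 900 - 300 + 100 = 700, i.e. two thirds + * of the old average plus one third of the new sample.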
+ */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru_ghost; @@ -805,6 +838,9 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); @@ -816,6 +852,7 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -825,6 +862,58 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. In this case, do not write log blocks to the L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. */ +static int l2arc_dev_hdr_read(l2arc_dev_t *dev); +static int l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io); +static zio_t *l2arc_log_blk_fetch(vdev_t *vd, + const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); +static void l2arc_log_blk_fetch_abort(zio_t *zio); + +/* L2ARC persistence block restoration routines. */ +static void l2arc_log_blk_restore(l2arc_dev_t *dev, + const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr); +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, + l2arc_dev_t *dev); + +/* L2ARC persistence write I/O routines. */ +static void l2arc_dev_hdr_update(l2arc_dev_t *dev); +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + +/* L2ARC persistence auxiliary routines.
*/ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, + const arc_buf_hdr_t *ab); +boolean_t l2arc_range_check_overlap(uint64_t bottom, + uint64_t top, uint64_t check); +static void l2arc_blk_fetch_done(zio_t *zio); +static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without @@ -1584,6 +1673,42 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) } /* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. + */ +arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, boolean_t protected, boolean_t prefetch) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + + return (hdr); +} + +/* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t @@ -7463,6 +7588,103 @@ arc_fini(void) * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistence: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. + * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) When writing to the L2ARC, we occasionally write a "l2arc log block", + * which is an additional piece of metadata which describes what's been + * written. This allows us to rebuild the arc_buf_hdr_t structures of the + * main ARC buffers. There are 2 linked-lists of log blocks headed by + * dh_start_lbps[2]. We alternate which chain we append to, so they are + * time-wise and offset-wise interleaved, but that is an optimization rather + * than for correctness. The log block also includes a pointer to the + * previous block in its chain. + * + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device + * for our header bookkeeping purposes. This contains a device header, + * which contains our top-level reference structures. We update it each + * time we write a new log block, so that we're able to locate it in the + * L2ARC device. If this write results in an inconsistent device header + * (e.g. due to power failure), we detect this by verifying the header's + * checksum and simply fail to reconstruct the L2ARC after reboot. 
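+ * + * For orientation, the device header fields referenced throughout this code + * (l2arc_dev_hdr_phys_t, defined alongside the other ARC structures) are the + * ones filled in by l2arc_dev_hdr_update() below: dh_magic and dh_version + * identify a valid header, dh_spa_guid and dh_vdev_guid bind it to this pool + * and cache device, dh_start_lbps[2] point at the two newest log blocks, + * dh_log_blk_ent gives the number of log entries per log block, dh_evict + * records the evict hand, and dh_start/dh_end/dh_flags describe the usable + * device extent and state.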
+ * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | ___two newest log block pointers__.__________ | + * | / \dh_start_lbps[1] | + * | / \ \dh_start_lbps[0]| + * |.___/__. V V | + * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| + * || hdr| ^ /^ /^ / / | + * |+------+ ...--\-------/ \-----/--\------/ / | + * | \--------------/ \--------------/ | + * +======================================================================+ + * + * As can be seen on the diagram, rather than using a simple linked list, + * we use a pair of linked lists with alternating elements. This is a + * performance enhancement: with a single list we would only find out the + * address of the next log block once the current block had been completely + * read in, which would keep the device's I/O queue at only one operation + * deep and thus incur a large amount of I/O round-trip latency. Having two + * lists allows us to fetch two log blocks ahead of where we are currently + * rebuilding L2ARC buffers. + * + * On-device data structures: + * + * L2ARC device header: l2arc_dev_hdr_phys_t + * L2ARC log block: l2arc_log_blk_phys_t + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (writing + * a new log block every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed log block (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> + * ^ ^^^^^^^^^___________________________________ + * | \ + * <<nextwrite>> may overwrite this blk and/or its bufs --' + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process (see l2arc_rebuild). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update blocks which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. So even if the cache were populated by completely rotten + * blocks for data that had been long deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA together + * with the birth TXG uniquely identifies a block in space and time - once + * created, a block is immutable on disk. The worst that can happen is that + * we waste some time and memory at l2arc rebuild reconstructing outdated + * ARC entries, which will get dropped from the l2arc as it is being updated + * with new blocks. + * + * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write + * hand are not restored. This is done by saving the offset (in bytes) + * l2arc_evict() has evicted to in the L2ARC device header and taking it + * into account when restoring buffers. */ static boolean_t @@ -7508,10 +7730,12 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur.
*/ dev_size = dev->l2ad_end - dev->l2ad_start; - if (size >= dev_size) { + if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "exceeds the size of the cache device (guid %llu), " - "resetting them to the default (%d)", + "plus the overhead of log blocks (persistent L2ARC, " + "%llu bytes) exceeds the size of the cache device " + "(guid %llu), resetting them to the default (%d)", + l2arc_log_blk_overhead(size, dev), dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; @@ -7584,10 +7808,10 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev)); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) next = NULL; l2arc_dev_last = next; @@ -7636,12 +7860,14 @@ l2arc_do_free_on_write(void) static void l2arc_write_done(zio_t *zio) { - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; + l2arc_write_callback_t *cb; + l2arc_lb_abd_buf_t *abd_buf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *hdr, *hdr_prev; + kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); @@ -7738,12 +7964,33 @@ top: mutex_exit(hash_lock); } + /* + * Free the allocated abd buffers for writing the log blocks. + * If the zio failed reclaim the allocated space and remove the + * pointers to these log blocks from the log block pointer list + * of the L2ARC device. + */ + while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { + abd_free(abd_buf->abd); + zio_buf_free(abd_buf, sizeof (*abd_buf)); + if (zio->io_error != 0) { + lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); + bytes_dropped += + L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + list_destroy(&cb->l2wcb_abd_list); + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); + ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -8029,8 +8276,31 @@ l2arc_sublist_lock(int list_num) } /* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. + */ +static inline uint64_t +l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) +{ + if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) { + return (0); + } else { + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; + + uint64_t log_blocks = (log_entries + + dev->l2ad_dev_hdr->dh_log_blk_ent - 1) / + dev->l2ad_dev_hdr->dh_log_blk_ent; + + return (vdev_psize_to_asize(dev->l2ad_vdev, + sizeof (l2arc_log_blk_phys_t)) * log_blocks); + } +} + +/* * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. + * bytes. This distance may span populated buffers, it may span nothing. 
* This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ @@ -8042,19 +8312,25 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) kmutex_t *hash_lock; uint64_t taddr; boolean_t rerun; + l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; buflist = &dev->l2ad_buflist; + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance, dev); + top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, - * evict to the end. Then bump the write hand to the start - * and iterate. This iteration does not happen indefinitely - * as we make sure in l2arc_write_size() that when l2ad_hand - * is reset, the write size does not exceed the end of the - * device. + * evict to the end. Then bump the write and evict hands + * to the start and iterate. This iteration does not + * happen indefinitely as we make sure in + * l2arc_write_size() that when the write hand is reset, + * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; @@ -8064,16 +8340,57 @@ top: DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); + /* + * This check has to be placed after deciding whether to iterate + * (rerun). + */ if (!all && dev->l2ad_first) { /* - * This is the first sweep through the device. There is + * This is the first sweep through the device. There is * nothing to evict. */ goto out; } + /* + * When rebuilding L2ARC we retrieve the evict hand from the header of + * the device. Of note, l2arc_evict() does not actually delete buffers + * from the cache device, but keeping track of the evict hand will be + * useful when TRIM is implemented. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + retry: mutex_enter(&dev->l2ad_mtx); + /* + * We have to account for evicted log blocks. Run vdev_space_update() + * on log blocks whose offset (in bytes) is before the evicted offset + * (in bytes) by searching in the list of pointers to log blocks + * present in the L2ARC device. + */ + for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; + lb_ptr_buf = lb_ptr_buf_prev) { + + lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + + /* + * We don't worry about log blocks left behind (i.e. + * lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers() + * will never write more than l2arc_evict() evicts. + */ + if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { + break; + } else { + vdev_space_update(dev->l2ad_vdev, + -L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0); + list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -8105,7 +8422,7 @@ retry: ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || + if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, @@ -8144,12 +8461,17 @@ retry: mutex_exit(&dev->l2ad_mtx); out: - if (rerun) { + /* + * We need to check whether we are evicting all buffers; otherwise + * we may iterate unnecessarily. + */ + if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end.
l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } @@ -8272,6 +8594,17 @@ error: return (ret); } +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_put(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + /* * Find and write ARC buffers to the L2ARC device. * @@ -8281,17 +8614,18 @@ error: * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -8343,7 +8677,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { + if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ @@ -8443,6 +8777,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } @@ -8477,6 +8814,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) mutex_exit(hash_lock); + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + zio_nowait(wzio); } @@ -8491,6 +8836,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. + */ + l2arc_dev_hdr_update(dev); + return (0); } @@ -8500,6 +8852,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); + l2arc_dev_hdr_update(dev); + dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; @@ -8611,7 +8965,17 @@ l2arc_feed_thread(void *unused) boolean_t l2arc_vdev_present(vdev_t *vd) { - l2arc_dev_t *dev; + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. 
+ */ +static l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; @@ -8621,7 +8985,7 @@ l2arc_vdev_present(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - return (dev != NULL); + return (dev); } /* @@ -8631,22 +8995,29 @@ l2arc_vdev_present(vdev_t *vd) void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { - l2arc_dev_t *adddev; + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ - adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* @@ -8656,6 +9027,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + /* + * This is a list of pointers to log blocks that are still present + * on the device. + */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); @@ -8666,6 +9044,89 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); + + /* + * Decide if vdev is eligible for L2ARC rebuild + */ + l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); +} + +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t l2dhdr_asize; + spa_t *spa; + int err; + boolean_t rebuild = B_TRUE; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + spa = dev->l2ad_spa; + l2dhdr = dev->l2ad_dev_hdr; + l2dhdr_asize = dev->l2ad_dev_hdr_asize; + + /* + * The L2ARC has to hold at least the payload of one log block for + * its contents to be restored (persistent L2ARC). The payload of a + * log block depends on the number of its log entries. A log block + * can hold up to 1022 entries; how many are actually committed or + * restored per block depends on the size of the L2ARC device. Thus + * the maximum payload of one log block is 1022 * SPA_MAXBLOCKSIZE = + * 16GB. If the L2ARC device is smaller than that, we reduce the + * number of committed and restored log entries per block so as to + * enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header; if an error is returned, do not rebuild + * L2ARC.
+ */ + if ((err = l2arc_dev_hdr_read(dev)) != 0) + rebuild = B_FALSE; + + if (rebuild && l2dhdr->dh_log_blk_ent > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (!rebuild && spa_writeable(spa)) { + /* + * The boolean rebuild is false if reading the device header + * returned an error. In this case create a new header. We + * zero out the memory holding the header to reset + * dh_start_lbps. + */ + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } /* @@ -8674,24 +9135,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *dev, *nextdev, *remdev = NULL; + l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } + remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* + * Cancel any ongoing or scheduled rebuild. + */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* * Remove device from global list */ + mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); @@ -8702,9 +9168,12 @@ l2arc_remove_vdev(vdev_t *vd) */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); - kmem_free(remdev, sizeof (l2arc_dev_t)); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); + vmem_free(remdev, sizeof (l2arc_dev_t)); } void @@ -8717,6 +9186,8 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -8741,6 +9212,8 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); @@ -8772,6 +9245,865 @@ l2arc_stop(void) mutex_exit(&l2arc_feed_thr_lock); } +/* + * Punches out rebuild threads for the L2ARC devices in a spa. 
This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, + (void (*)(void *))l2arc_dev_rebuild_start, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. + */ +static void +l2arc_dev_rebuild_start(l2arc_dev_t *dev) +{ + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. + * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int i = 0, err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. 
At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + goto out; + } + + spa_config_exit(spa, SCL_L2ARC, vd); + lock_held = B_FALSE; + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this log block. + */ + l2arc_log_blk_restore(dev, this_lb, + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), + lbps[0].lbp_daddr); + i++; + + /* + * Log block restored; include its pointer in the list of + * pointers to log blocks present in the L2ARC device. + */ + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), + KM_SLEEP); + bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(vd, + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0); + + /* + * Protection against loops of log blocks: + * + * l2ad_hand l2ad_evict + * V V + * l2ad_start |=======================================| l2ad_end + * -----|||----|||---|||----||| + * (3) (2) (1) (0) + * ---|||---|||----|||---||| + * (7) (6) (5) (4) + * + * In this situation the pointer of log block (4) passes + * l2arc_log_blkptr_valid() but the log block should not be + * restored as it is overwritten by the payload of log block + * (0). Only log blocks (0)-(3) should be restored. We check + * whether l2ad_evict lies in between the next log block + * offset (lbps[1].lbp_daddr) and the present log block offset + * (lbps[0].lbp_daddr). If true and this isn't the first pass, + * we are looping from the beginning and we should stop. + */ + if (l2arc_range_check_overlap(lbps[1].lbp_daddr, + lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first) + goto out; + + for (;;) { + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); + mutex_exit(&l2arc_rebuild_thr_lock); + err = SET_ERROR(ECANCELED); + goto out; + } + mutex_exit(&l2arc_rebuild_thr_lock); + if (spa_config_tryenter(spa, SCL_L2ARC, vd, + RW_READER)) { + lock_held = B_TRUE; + break; + } + /* + * The L2ARC config lock is held by somebody as writer, + * possibly because they are trying to remove us. They + * likely want us to shut down, so after a little + * delay, we check l2ad_rebuild_cancel and retry + * the lock again. + */ + delay(1); + } + + /* + * Continue with the next log block.
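+ * The pointer pair simply slides one step down the chain: the block + * we prefetched becomes the current one, and its lb_prev_lbp names + * the block to prefetch next.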
+ */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + vmem_free(this_lb, sizeof (*this_lb)); + vmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + zfs_dbgmsg("L2ARC rebuild disabled"); + } else if (err == 0 && i > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + zfs_dbgmsg("L2ARC successfully rebuilt, " + "restored %d blocks", i); + } else if (err != 0) { + zfs_dbgmsg("L2ARC rebuild aborted, " + "restored %d blocks", i); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and copies + * it into the in-core header (dev->l2ad_dev_hdr). On success, this function + * returns 0, otherwise the appropriate error code is returned. + */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents. + * + * This function implements a simple fetcher to make sure that while + * we're processing one buffer the L2ARC is already fetching the next + * one in the chain. + * + * The arguments this_lp and next_lp point to the current and next log block + * address in the block chain. Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. + * + * The `this_io' and `next_io' arguments are used for block fetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * fetched IO is returned in `next_io'. On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
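+ * (This is exactly the pattern used by the rebuild loop in l2arc_rebuild() + * above: `this_io' starts out NULL and each iteration forwards `next_io' + * into `this_io'.)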
+ * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no fetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the fetching IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of fetch IOs. + */ +static int +l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io) +{ + int err = 0; + zio_cksum_t cksum; + abd_t *abd = NULL; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log block in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, + this_lb); + } + + /* + * Peek to see if we can start issuing the next IO immediately. + */ + if (l2arc_log_blkptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log block early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log block. + */ + *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, + next_lb); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " + "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, + dev->l2ad_vdev->vdev_guid); + goto cleanup; + } + + /* Make sure the buffer checks out */ + fletcher_4_native(this_lb, + L2BLK_GET_PSIZE((this_lbp)->lbp_prop), NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, + dev->l2ad_hand, dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(L2BLK_GET_PSIZE( + (this_lbp)->lbp_prop), B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, + L2BLK_GET_PSIZE((this_lbp)->lbp_prop)); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, L2BLK_GET_PSIZE((this_lbp)->lbp_prop), + sizeof (*this_lb))) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. 
This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_psize, uint64_t lb_daddr) +{ + uint64_t size = 0, psize = 0; + uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent; + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * places new bufs here restores bufs here + * + * This also works when the restored bufs get evicted at any + * point during the rebuild. + */ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + psize += L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * psize Physical size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. + */ +static void +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); + uint64_t asize; + + /* + * Do all the allocation before grabbing any locks; this lets us + * sleep if memory is full and we don't have to deal with failed + * allocations. + */ + hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, + dev, le->le_dva, le->le_daddr, + L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_COMPRESS((le)->le_prop), + L2BLK_GET_PROTECTED((le)->le_prop), + L2BLK_GET_PREFETCH((le)->le_prop)); + asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); + + /* + * vdev_space_update() has to be called before arc_hdr_destroy() to + * avoid underflow since the latter also calls the former. + */ + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); + + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + mutex_exit(&dev->l2ad_mtx); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. */ + arc_hdr_destroy(hdr); + /* + * If the buffer is already cached, check whether it has + * L2ARC metadata. If not, fill in the metadata and set the + * flag. This is important in case of onlining a cache device, + * since we previously evicted all L2ARC metadata from ARC. + */ + if (!HDR_HAS_L2HDR(exists)) { + arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); + exists->b_l2hdr.b_dev = dev; + exists->b_l2hdr.b_daddr = le->le_daddr; + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, exists); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(exists), exists); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); + } + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + } + + mutex_exit(hash_lock); +} + +/* + * Starts an asynchronous read IO to read a log block. This is used in log + * block reconstruction to start reading the next block before we are done + * decoding and reconstructing the current block, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain a newly allocated memory buffer for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed). If you wish to abort this + * zio, you should do so using l2arc_log_blk_fetch_abort, which takes + * care of disposing of the allocated buffers correctly. + */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t psize; + zio_t *pio; + l2arc_read_callback_t *cb; + + psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(psize <= sizeof (l2arc_log_blk_phys_t)); + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, psize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Writes the device header to the l2arc device. The write is issued + * synchronously: the function waits for it to complete before returning.
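+ * The header is written with ZIO_CHECKSUM_LABEL, so a torn or stale header + * is caught by the checksum and the magic/guid checks in + * l2arc_dev_hdr_read() the next time the device is imported.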
+ */ +static void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_flags = 0; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. + */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* try to compress the buffer */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb)); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. 
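+ * As a concrete trace of the rotation (log blocks committed at offsets + * A, B, C, D in that order, with lb_prev_lbp captured above before this + * shift): after A the pair is {A, -}; after B, {B, A}; after C, {C, B}; + * after D, {D, C}. D's lb_prev_lbp then points at B and C's at A, which + * produces the two interleaved chains described at the top of this section.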
+ */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + _NOTE(CONSTCOND) + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_put(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. + */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. 
+ */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + uint64_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + psize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. + */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + + if (l2dhdr->dh_log_blk_ent == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, l2dhdr->dh_log_blk_ent); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * <check>--------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------<bottom>============<top>--------------| + * + * bottom > top: Looped-around case: + * <check>--------+------------------+ + * | (overlap here?) 
| + * L2ARC dev V V + * |===============<top>---------------<bottom>===========| + * ^ ^ + * | (or here?) | + * +---------------+---------<check> + * + * top == bottom : Just a single address comparison. + */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} + EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); @@ -8861,6 +10193,12 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, + "Rebuild the L2ARC when importing a pool"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, + "Min size in bytes to write rebuild log blocks in L2ARC"); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
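As a closing illustration, here is a minimal self-contained sketch of the wrap-around semantics of l2arc_range_check_overlap() above (not part of the patch; the function body is copied verbatim from the diff, the test offsets are made up):

#include <assert.h>
#include <stdint.h>

typedef int boolean_t;

/* Copied verbatim from the patch above. */
static boolean_t
l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
{
	if (bottom < top)
		return (bottom <= check && check <= top);
	else if (bottom > top)
		return (check <= top || bottom <= check);
	else
		return (check == top);
}

int
main(void)
{
	/* Sequentially ordered range: 100..200. */
	assert(l2arc_range_check_overlap(100, 200, 150));
	assert(!l2arc_range_check_overlap(100, 200, 250));

	/* Looped-around range: written from 900, wrapped past the end to 100. */
	assert(l2arc_range_check_overlap(900, 100, 950));
	assert(l2arc_range_check_overlap(900, 100, 50));
	assert(!l2arc_range_check_overlap(900, 100, 500));

	/* top == bottom: a single address comparison. */
	assert(l2arc_range_check_overlap(300, 300, 300));
	assert(!l2arc_range_check_overlap(300, 300, 301));

	return (0);
}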