diff options
-rw-r--r-- | cmd/zdb/zdb.c | 123 | ||||
-rw-r--r-- | include/sys/arc_impl.h | 40 | ||||
-rw-r--r-- | man/man8/zdb.8 | 5 | ||||
-rw-r--r-- | module/zfs/arc.c | 234 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh | 2 |
5 files changed, 278 insertions, 126 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index f4b4b454b..00258799b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3493,12 +3493,13 @@ print_l2arc_log_blocks(void) static void dump_l2arc_log_entries(uint64_t log_entries, - l2arc_log_ent_phys_t *le, int i) + l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; - (void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu, " - "vdev: %llu, offset: %llu\n", i, j + 1, + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); @@ -3533,7 +3534,7 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) (u_longlong_t)lbps.lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); - (void) printf("|\t\tpsize: %llu\n", + (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); @@ -3543,17 +3544,19 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) } static void -dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; - uint64_t psize; + uint64_t asize; l2arc_log_blkptr_t lbps[2]; abd_t *abd; zio_cksum_t cksum; - int i = 0, failed = 0; + int failed = 0; l2arc_dev_t dev; - print_l2arc_log_blocks(); + if (!dump_opt['q']) + print_l2arc_log_blocks(); bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr.dh_evict; @@ -3562,8 +3565,10 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ - (void) printf("No log blocks to read\n"); - (void) printf("\n"); + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + @@ -3576,17 +3581,23 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; - psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); - if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) { - (void) printf("Error while reading next log block\n\n"); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } break; } - fletcher_4_native_varsize(&this_lb, psize, &cksum); + fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; - (void) printf("Invalid cksum\n"); - dump_l2arc_log_blkptr(lbps[0]); + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } break; } @@ -3594,11 +3605,11 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, &this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &this_lb, - psize, sizeof (this_lb)); + asize, sizeof (this_lb)); abd_free(abd); break; default: @@ -3608,39 +3619,52 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { - (void) printf("Invalid log block magic\n\n"); + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); break; } - i++; - if (dump_opt['l'] > 1) { - (void) printf("lb[%4d]\tmagic: %llu\n", i, + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(lbps[0]); } - if (dump_opt['l'] > 2) - dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent, - this_lb.lb_entries, i); + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } - (void) printf("log_blk_count:\t %d with valid cksum\n", i); - (void) printf("\t\t %d with invalid cksum\n\n", failed); + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } } -static void +static int dump_l2arc_header(int fd) { - l2arc_dev_hdr_phys_t l2dhdr; + l2arc_dev_hdr_phys_t l2dhdr, rebuild; int error = B_FALSE; + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, sizeof (rebuild)); + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; @@ -3654,6 +3678,8 @@ dump_l2arc_header(int fd) if (error) { (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); } else if (!dump_opt['q']) { print_l2arc_header(); @@ -3672,16 +3698,39 @@ dump_l2arc_header(int fd) (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", - (u_longlong_t)l2dhdr.dh_log_blk_ent); + (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); - (void) printf(" evict: %llu\n\n", + (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); - - dump_l2arc_log_blocks(fd, l2dhdr); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n\n", + (u_longlong_t)l2dhdr.dh_lb_count); } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. + */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); } static void @@ -4009,7 +4058,7 @@ dump_label(const char *dev) * Dump the L2ARC header, if existent. */ if (read_l2arc_header) - dump_l2arc_header(fd); + error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 928b72325..e8c944ce8 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -200,7 +200,7 @@ typedef struct l2arc_log_blkptr { /* * lbp_prop has the following format: * * logical size (in bytes) - * * physical (compressed) size (in bytes) + * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ @@ -221,22 +221,26 @@ typedef struct l2arc_dev_hdr_phys { */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; - uint64_t dh_log_blk_ent; /* entries per log blk */ + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ - uint64_t dh_start; - uint64_t dh_end; - + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; - const uint64_t dh_pad[34]; /* pad to 512 bytes */ + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); @@ -387,6 +391,14 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; } l2arc_dev_t; /* @@ -738,14 +750,18 @@ typedef struct arc_stats { */ kstat_named_t arcstat_l2_log_blk_writes; /* - * Moving average of the physical size of the L2ARC log blocks, in + * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ - kstat_named_t arcstat_l2_log_blk_avg_size; + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; /* - * Moving average of the physical size of L2ARC restored data, in bytes, - * to the physical size of their metadata in ARC, in bytes. + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; @@ -780,6 +796,8 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. @@ -790,8 +808,6 @@ typedef struct arc_stats { * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; - /* Physical size of L2ARC restored data, in bytes. */ - kstat_named_t arcstat_l2_rebuild_psize; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 3915be3f8..01ad95b3e 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -216,7 +216,10 @@ Read the vdev labels and L2ARC header from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid labels were found. The presence of L2ARC header is indicated by a specific -sequence (L2ARC_DEV_HDR_MAGIC). Each unique configuration is displayed only +sequence (L2ARC_DEV_HDR_MAGIC). If there is an accounting error in the size +or the number of L2ARC log blocks +.Nm Fl l +will return 1. Each unique configuration is displayed only once. .It Fl ll Ar device In addition display label space usage stats. If a valid L2ARC header was found diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5b34d4abd..a6b739ec3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -530,7 +530,9 @@ arc_stats_t arc_stats = { { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, - { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, @@ -539,9 +541,9 @@ arc_stats_t arc_stats = { { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, - { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, @@ -895,7 +897,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, - const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr); + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); @@ -7864,6 +7866,7 @@ l2arc_write_done(zio_t *zio) l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; @@ -7872,6 +7875,7 @@ l2arc_write_done(zio_t *zio) cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -7975,8 +7979,18 @@ top: zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); - bytes_dropped += + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); @@ -7984,6 +7998,17 @@ top: } list_destroy(&cb->l2wcb_abd_list); + if (zio->io_error != 0) { + /* restore the lbps array in the header to its previous state */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); @@ -8277,21 +8302,21 @@ l2arc_sublist_lock(int list_num) /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given - * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { - if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) { + if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + - dev->l2ad_dev_hdr->dh_log_blk_ent - 1) / - dev->l2ad_dev_hdr->dh_log_blk_ent; + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); @@ -8373,17 +8398,24 @@ retry: lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + /* * We don't worry about log blocks left behind (ie - * lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers() + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, - -L2BLK_GET_PSIZE( - (lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); @@ -8475,6 +8507,10 @@ out: dev->l2ad_first = B_FALSE; goto top; } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8777,6 +8813,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -8846,6 +8886,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); @@ -9036,6 +9079,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list @@ -9059,7 +9104,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) uint64_t l2dhdr_asize; spa_t *spa; int err; - boolean_t rebuild = B_TRUE; + boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); @@ -9089,9 +9134,9 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * Read the device header, if an error is returned do not rebuild L2ARC. */ if ((err = l2arc_dev_hdr_read(dev)) != 0) - rebuild = B_FALSE; + l2dhdr_valid = B_FALSE; - if (rebuild && l2dhdr->dh_log_blk_ent > 0) { + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, @@ -9117,12 +9162,10 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; - } else if (!rebuild && spa_writeable(spa)) { + } else if (spa_writeable(spa)) { /* - * The boolean rebuild is false if reading the device header - * returned an error. In this case create a new header. We - * zero out the memory holding the header to reset - * dh_start_lbps. + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. */ bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); @@ -9172,6 +9215,8 @@ l2arc_remove_vdev(vdev_t *vd) list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -9309,7 +9354,7 @@ l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; - int i = 0, err = 0; + int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; @@ -9332,6 +9377,7 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + @@ -9381,11 +9427,10 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ - l2arc_log_blk_restore(dev, this_lb, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), - lbps[0].lbp_daddr); - i++; + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); /* * log block restored, include its pointer in the list of @@ -9398,9 +9443,12 @@ l2arc_rebuild(l2arc_dev_t *dev) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(vd, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0); + vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: @@ -9417,13 +9465,16 @@ l2arc_rebuild(l2arc_dev_t *dev) * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check - * whether l2ad_evict lies in between the next log block - * offset (lbps[1].lbp_daddr) and the present log block offset - * (lbps[0].lbp_daddr). If true and this isn't the first pass, - * we are looping from the beginning and we should stop. + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. */ - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) goto out; for (;;) { @@ -9470,14 +9521,27 @@ out: vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { - zfs_dbgmsg("L2ARC rebuild disabled"); - } else if (err == 0 && i > 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); - zfs_dbgmsg("L2ARC successfully rebuilt, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); } else if (err != 0) { - zfs_dbgmsg("L2ARC rebuild aborted, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) @@ -9527,7 +9591,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || - l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict)) { @@ -9578,7 +9642,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev, int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; - uint64_t psize; + uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); @@ -9616,9 +9680,12 @@ l2arc_log_blk_read(l2arc_dev_t *dev, goto cleanup; } - /* Make sure the buffer checks out */ - psize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); - fletcher_4_native(this_lb, psize, NULL, &cksum); + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " @@ -9634,11 +9701,11 @@ l2arc_log_blk_read(l2arc_dev_t *dev, case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, psize, sizeof (*this_lb))) != 0) { + abd, this_lb, asize, sizeof (*this_lb))) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } @@ -9672,10 +9739,10 @@ cleanup: */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, - uint64_t lb_psize, uint64_t lb_daddr) + uint64_t lb_asize, uint64_t lb_daddr) { - uint64_t size = 0, psize = 0; - uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent; + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; for (int i = log_entries - 1; i >= 0; i--) { /* @@ -9692,27 +9759,28 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * ^ ^ * | | * | | - * l2arc_fill_thread l2arc_rebuild - * places new bufs here restores bufs here + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here * - * This also works when the restored bufs get evicted at any - * point during the rebuild. + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); - psize += L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC - * psize Physical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); - ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); - ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } @@ -9800,18 +9868,20 @@ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { - uint32_t psize; + uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; - psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - ASSERT(psize <= sizeof (l2arc_log_blk_phys_t)); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_abd = abd_get_from_buf(lb, psize); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); - (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize, + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); @@ -9841,14 +9911,18 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) abd_t *abd; int err; + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; - l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -9884,7 +9958,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; - VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent); + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); @@ -9896,8 +9970,14 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; - /* try to compress the buffer */ + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb)); @@ -9962,13 +10042,17 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); @@ -9985,8 +10069,9 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { - uint64_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - uint64_t end = lbp->lbp_daddr + psize - 1; + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; @@ -10017,7 +10102,7 @@ l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && - psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t) && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } @@ -10032,14 +10117,13 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; - l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; - if (l2dhdr->dh_log_blk_ent == 0) + if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; - ASSERT3S(index, <, l2dhdr->dh_log_blk_ent); + ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; @@ -10059,7 +10143,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); - return (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent); + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh index b202fac40..f313923d1 100755 --- a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh @@ -99,7 +99,7 @@ typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 -log_must zdb -lq $VDEV_CACHE +log_must zdb -lll $VDEV_CACHE log_must zpool destroy -f $TESTPOOL |