aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeorge Amanakis <[email protected]>2020-04-10 13:33:35 -0400
committerGitHub <[email protected]>2020-04-10 10:33:35 -0700
commit77f6826b83b7e27f0996f6d192202c36f65e41fd (patch)
treeb9946c99348bf6742cc41739aeff1a2b952d9d2f
parent36a6e2335c45212f2609269bcee3004908ac6bcb (diff)
Persistent L2ARC
This commit makes the L2ARC persistent across reboots. We implement a light-weight persistent L2ARC metadata structure that allows L2ARC contents to be recovered after a reboot. This significantly eases the impact a reboot has on read performance on systems with large caches. Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Co-authored-by: Saso Kiselkov <[email protected]> Co-authored-by: Jorgen Lundman <[email protected]> Co-authored-by: George Amanakis <[email protected]> Ported-by: Yuxuan Shui <[email protected]> Signed-off-by: George Amanakis <[email protected]> Closes #925 Closes #1823 Closes #2672 Closes #3744 Closes #9582
-rw-r--r--cmd/zdb/zdb.c229
-rw-r--r--configure.ac1
-rw-r--r--include/sys/arc.h4
-rw-r--r--include/sys/arc_impl.h305
-rw-r--r--include/sys/fs/zfs.h5
-rw-r--r--include/sys/spa.h1
-rw-r--r--lib/libzfs/libzfs_import.c31
-rw-r--r--man/man5/zfs-module-parameters.551
-rw-r--r--man/man8/zdb.816
-rw-r--r--man/man8/zpool-labelclear.85
-rw-r--r--man/man8/zpoolconcepts.824
-rw-r--r--module/os/linux/zfs/zfs_sysfs.c3
-rw-r--r--module/zfs/arc.c1430
-rw-r--r--module/zfs/spa.c13
-rw-r--r--module/zfs/vdev.c19
-rw-r--r--tests/runfiles/linux.run6
-rw-r--r--tests/zfs-tests/include/tunables.cfg2
-rw-r--r--tests/zfs-tests/tests/functional/Makefile.am1
-rw-r--r--tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am15
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh31
-rw-r--r--tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg37
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh106
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh112
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh87
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh101
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh108
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh98
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh95
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh143
-rwxr-xr-xtests/zfs-tests/tests/functional/persist_l2arc/setup.ksh29
30 files changed, 3020 insertions, 88 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index e9e801b11..dab0d8b68 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -62,6 +62,7 @@
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
@@ -3475,6 +3476,216 @@ print_label_header(zdb_label_t *label, int l)
}
static void
+print_l2arc_header(void)
+{
+	/* Separator line framing the section title printed by zdb -l. */
+	static const char rule[] = "------------------------------------\n";
+
+	(void) printf("%s", rule);
+	(void) printf("L2ARC device header\n");
+	(void) printf("%s", rule);
+}
+
+/*
+ * Print the banner that introduces the L2ARC log block section.
+ */
+static void
+print_l2arc_log_blocks(void)
+{
+	static const char rule[] = "------------------------------------\n";
+
+	(void) printf("%s", rule);
+	(void) printf("L2ARC device log blocks\n");
+	(void) printf("%s", rule);
+}
+
+/*
+ * Print every log entry of log block number i.
+ *
+ * log_entries is the per-block entry count taken from the device header
+ * and le points at the block's lb_entries array. Because the count is read
+ * from disk it is clamped to L2ARC_LOG_BLK_MAX_ENTRIES, the size of
+ * lb_entries, so that a corrupted header cannot make us read past the end
+ * of the in-memory log block.
+ */
+static void
+dump_l2arc_log_entries(uint64_t log_entries,
+    l2arc_log_ent_phys_t *le, int i)
+{
+	if (log_entries > L2ARC_LOG_BLK_MAX_ENTRIES)
+		log_entries = L2ARC_LOG_BLK_MAX_ENTRIES;
+
+	for (uint64_t j = 0; j < log_entries; j++) {
+		/* Local copy: the DVA_GET_* macros take a pointer. */
+		dva_t dva = le[j].le_dva;
+		(void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu, "
+		    "vdev: %llu, offset: %llu\n", i, (int)(j + 1),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva),
+		    (u_longlong_t)DVA_GET_VDEV(&dva),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva));
+		(void) printf("|\t\t\t\tbirth: %llu\n",
+		    (u_longlong_t)le[j].le_birth);
+		(void) printf("|\t\t\t\tlsize: %llu\n",
+		    (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tpsize: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tcompr: %llu\n",
+		    (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
+		(void) printf("|\t\t\t\ttype: %llu\n",
+		    (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tprotected: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tprefetch: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
+		(void) printf("|\t\t\t\taddress: %llu\n",
+		    (u_longlong_t)le[j].le_daddr);
+		(void) printf("|\n");
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print one L2ARC log block pointer: its device address, the payload
+ * extent it describes, and the size/compression/checksum fields packed
+ * into lbp_prop.
+ */
+static void
+dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
+{
+	/* All remaining fields are bit-fields of the same property word. */
+	const uint64_t prop = lbps.lbp_prop;
+
+	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr);
+	(void) printf("|\t\tpayload_asize: %llu\n",
+	    (u_longlong_t)lbps.lbp_payload_asize);
+	(void) printf("|\t\tpayload_start: %llu\n",
+	    (u_longlong_t)lbps.lbp_payload_start);
+	(void) printf("|\t\tlsize: %llu\n",
+	    (u_longlong_t)L2BLK_GET_LSIZE(prop));
+	(void) printf("|\t\tpsize: %llu\n",
+	    (u_longlong_t)L2BLK_GET_PSIZE(prop));
+	(void) printf("|\t\tcompralgo: %llu\n",
+	    (u_longlong_t)L2BLK_GET_COMPRESS(prop));
+	(void) printf("|\t\tcksumalgo: %llu\n",
+	    (u_longlong_t)L2BLK_GET_CHECKSUM(prop));
+	(void) printf("|\n\n");
+}
+
+/*
+ * Walk the chain of L2ARC log blocks recorded in the device header and
+ * print a summary of what was found.
+ *
+ * fd is the open cache device and l2dhdr the (already byteswapped) device
+ * header. Starting from dh_start_lbps, each block is read, checksummed,
+ * decompressed when LZ4-compressed and byteswapped if needed. With -ll the
+ * block pointer is printed, with -lll the individual log entries as well.
+ * The walk stops at the first invalid pointer, read error, checksum
+ * mismatch, decompression failure or bad magic, or when the next pointer
+ * overlaps the evict offset on a device that is past its first sweep.
+ */
+static void
+dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
+{
+	l2arc_log_blk_phys_t this_lb;
+	uint64_t psize;
+	l2arc_log_blkptr_t lbps[2];
+	abd_t *abd;
+	zio_cksum_t cksum;
+	int i = 0, failed = 0;
+	int decomp_err;
+	l2arc_dev_t dev;
+
+	print_l2arc_log_blocks();
+	bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));
+
+	/*
+	 * Only the fields set here and below are initialized in this
+	 * stack-allocated device before it is handed to
+	 * l2arc_log_blkptr_valid().
+	 */
+	dev.l2ad_evict = l2dhdr.dh_evict;
+	dev.l2ad_start = l2dhdr.dh_start;
+	dev.l2ad_end = l2dhdr.dh_end;
+
+	if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
+		/* no log blocks to read */
+		(void) printf("No log blocks to read\n");
+		(void) printf("\n");
+		return;
+	} else {
+		dev.l2ad_hand = lbps[0].lbp_daddr +
+		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+	}
+
+	dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+	for (;;) {
+		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
+			break;
+
+		psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+		if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) {
+			(void) printf("Error while reading next log block\n\n");
+			break;
+		}
+
+		/* The checksum covers the block as stored on disk. */
+		fletcher_4_native_varsize(&this_lb, psize, &cksum);
+		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
+			failed++;
+			(void) printf("Invalid cksum\n");
+			dump_l2arc_log_blkptr(lbps[0]);
+			break;
+		}
+
+		decomp_err = 0;
+		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
+		case ZIO_COMPRESS_OFF:
+			break;
+		case ZIO_COMPRESS_LZ4:
+			/* Stage the raw bytes so we can inflate in place. */
+			abd = abd_alloc_for_io(psize, B_TRUE);
+			abd_copy_from_buf_off(abd, &this_lb, 0, psize);
+			decomp_err = zio_decompress_data(L2BLK_GET_COMPRESS(
+			    (&lbps[0])->lbp_prop), abd, &this_lb,
+			    psize, sizeof (this_lb));
+			abd_free(abd);
+			break;
+		default:
+			break;
+		}
+
+		/*
+		 * A failed decompression leaves this_lb partially
+		 * overwritten; stop instead of interpreting it.
+		 */
+		if (decomp_err != 0) {
+			(void) printf("L2ARC block decompression failed\n\n");
+			break;
+		}
+
+		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+			byteswap_uint64_array(&this_lb, psize);
+
+		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
+			(void) printf("Invalid log block magic\n\n");
+			break;
+		}
+
+		i++;
+		if (dump_opt['l'] > 1) {
+			(void) printf("lb[%4d]\tmagic: %llu\n", i,
+			    (u_longlong_t)this_lb.lb_magic);
+			dump_l2arc_log_blkptr(lbps[0]);
+		}
+
+		if (dump_opt['l'] > 2)
+			dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent,
+			    this_lb.lb_entries, i);
+
+		if (l2arc_range_check_overlap(lbps[1].lbp_daddr,
+		    lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first)
+			break;
+
+		/* Advance: [1] becomes current, its predecessor follows. */
+		lbps[0] = lbps[1];
+		lbps[1] = this_lb.lb_prev_lbp;
+	}
+
+	(void) printf("log_blk_count:\t %d with valid cksum\n", i);
+	(void) printf("\t\t %d with invalid cksum\n\n", failed);
+}
+
+/*
+ * Read the persistent L2ARC device header at offset VDEV_LABEL_START_SIZE
+ * of fd and print it, followed by the log blocks it points to.
+ *
+ * The header is byteswapped when its magic is stored in the opposite byte
+ * order. A short read or a wrong magic is reported as "not found"; with
+ * -q a valid header is accepted silently and nothing is dumped.
+ */
+static void
+dump_l2arc_header(int fd)
+{
+	l2arc_dev_hdr_phys_t l2dhdr;
+	boolean_t found = B_FALSE;
+
+	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
+	    VDEV_LABEL_START_SIZE) == sizeof (l2dhdr)) {
+		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
+
+		if (l2dhdr.dh_magic == L2ARC_DEV_HDR_MAGIC)
+			found = B_TRUE;
+	}
+
+	if (!found) {
+		(void) printf("L2ARC device header not found\n\n");
+		return;
+	}
+
+	if (dump_opt['q'])
+		return;
+
+	print_l2arc_header();
+
+	(void) printf(" magic: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_magic);
+	(void) printf(" version: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_version);
+	(void) printf(" pool_guid: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_spa_guid);
+	(void) printf(" flags: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_flags);
+	(void) printf(" start_lbps[0]: %llu\n",
+	    (u_longlong_t)
+	    l2dhdr.dh_start_lbps[0].lbp_daddr);
+	(void) printf(" start_lbps[1]: %llu\n",
+	    (u_longlong_t)
+	    l2dhdr.dh_start_lbps[1].lbp_daddr);
+	(void) printf(" log_blk_ent: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_log_blk_ent);
+	(void) printf(" start: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_start);
+	(void) printf(" end: %llu\n",
+	    (u_longlong_t)l2dhdr.dh_end);
+	(void) printf(" evict: %llu\n\n",
+	    (u_longlong_t)l2dhdr.dh_evict);
+
+	dump_l2arc_log_blocks(fd, l2dhdr);
+}
+
+static void
dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
{
if (dump_opt['q'])
@@ -3639,10 +3850,11 @@ dump_label(const char *dev)
{
char path[MAXPATHLEN];
zdb_label_t labels[VDEV_LABELS];
- uint64_t psize, ashift;
+ uint64_t psize, ashift, l2cache;
struct stat64 statbuf;
boolean_t config_found = B_FALSE;
boolean_t error = B_FALSE;
+ boolean_t read_l2arc_header = B_FALSE;
avl_tree_t config_tree;
avl_tree_t uberblock_tree;
void *node, *cookie;
@@ -3735,6 +3947,15 @@ dump_label(const char *dev)
if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
size = buflen;
+ /* If the device is a cache device read the header. */
+ if (!read_l2arc_header) {
+ if (nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
+ l2cache == POOL_STATE_L2CACHE) {
+ read_l2arc_header = B_TRUE;
+ }
+ }
+
fletcher_4_native_varsize(buf, size, &cksum);
rec = cksum_record_insert(&config_tree, &cksum, l);
@@ -3785,6 +4006,12 @@ dump_label(const char *dev)
nvlist_free(label->config_nv);
}
+ /*
+ * Dump the L2ARC header, if existent.
+ */
+ if (read_l2arc_header)
+ dump_l2arc_header(fd);
+
cookie = NULL;
while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
umem_free(node, sizeof (cksum_record_t));
diff --git a/configure.ac b/configure.ac
index 370a1970f..8604cdaa5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -336,6 +336,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/no_space/Makefile
tests/zfs-tests/tests/functional/nopwrite/Makefile
tests/zfs-tests/tests/functional/online_offline/Makefile
+ tests/zfs-tests/tests/functional/persist_l2arc/Makefile
tests/zfs-tests/tests/functional/pool_checkpoint/Makefile
tests/zfs-tests/tests/functional/pool_names/Makefile
tests/zfs-tests/tests/functional/poolversion/Makefile
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 75c483918..f500e1d45 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -310,10 +310,14 @@ void arc_fini(void);
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
+ uint64_t check);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
+void l2arc_spa_rebuild_start(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 2468a4aac..928b72325 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -20,9 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, Delphix. All rights reserved.
+ * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
*/
#ifndef _SYS_ARC_IMPL_H
@@ -176,6 +177,218 @@ typedef struct l1arc_buf_hdr {
abd_t *b_pabd;
} l1arc_buf_hdr_t;
+typedef enum l2arc_dev_hdr_flags_t {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+} l2arc_dev_hdr_flags_t;
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks).
+ */
+typedef struct l2arc_log_blkptr {
+ /*
+ * Offset of log block within the device, in bytes
+ */
+ uint64_t lbp_daddr;
+ /*
+ * Aligned payload size (in bytes) of the log block
+ */
+ uint64_t lbp_payload_asize;
+ /*
+ * Offset in bytes of the first buffer in the payload
+ */
+ uint64_t lbp_payload_start;
+ /*
+ * lbp_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for lbp_cksum)
+ */
+ uint64_t lbp_prop;
+ zio_cksum_t lbp_cksum; /* checksum of log */
+} l2arc_log_blkptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
+ uint64_t dh_version; /* Persistent L2ARC version */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t dh_spa_guid;
+ uint64_t dh_vdev_guid;
+ uint64_t dh_log_blk_ent; /* entries per log blk */
+ uint64_t dh_evict; /* evicted offset in bytes */
+ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
+ /*
+ * Used in zdb.c for determining if a log block is valid, in the same
+ * way that l2arc_rebuild() does.
+ */
+ uint64_t dh_start;
+ uint64_t dh_end;
+
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blkptr_t dh_start_lbps[2];
+ const uint64_t dh_pad[34]; /* pad to 512 bytes */
+ zio_eck_t dh_tail;
+} l2arc_dev_hdr_phys_t;
+CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t le_dva; /* dva of buffer */
+ uint64_t le_birth; /* birth txg of buffer */
+ /*
+ * le_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm
+ * * object type (used to restore arc_buf_contents_t)
+ * * protected status (used for encryption)
+ * * prefetch status (used in l2arc_read_done())
+ */
+ uint64_t le_prop;
+ uint64_t le_daddr; /* buf location on l2dev */
+ /*
+ * We pad the size of each entry to a power of 2 so that the size of
+ * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
+ * because of the L2BLK_SET_*SIZE macros.
+ */
+ const uint64_t le_pad[3]; /* pad to 64 bytes */
+} l2arc_log_ent_phys_t;
+
+#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
+
+/*
+ * A log block of up to 1022 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list. Byte order of magic determines
+ * whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_log_blk_phys {
+ uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
+ /*
+ * There are 2 chains (headed by dh_start_lbps[2]), and this field
+ * points back to the previous block in this chain. We alternate
+ * which chain we append to, so they are time-wise and offset-wise
+ * interleaved, but that is an optimization rather than for
+ * correctness.
+ */
+ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
+ /*
+ * Pad header section to 128 bytes
+ */
+ uint64_t lb_pad[7];
+ /* Payload */
+ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
+} l2arc_log_blk_phys_t; /* 64K total */
+
+/*
+ * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
+ * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
+ */
+CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
+ 1ULL << SPA_MINBLOCKSHIFT));
+CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
+CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
+
+/*
+ * These structures hold in-flight abd buffers for log blocks as they're being
+ * written to the L2ARC device.
+ */
+typedef struct l2arc_lb_abd_buf {
+ abd_t *abd;
+ list_node_t node;
+} l2arc_lb_abd_buf_t;
+
+/*
+ * These structures hold pointers to log blocks present on the L2ARC device.
+ */
+typedef struct l2arc_lb_ptr_buf {
+ l2arc_log_blkptr_t *lb_ptr;
+ list_node_t node;
+} l2arc_lb_ptr_buf_t;
+
+/* Macros for getting and setting fields in le_prop and lbp_prop */
+#define L2BLK_GET_LSIZE(field) \
+ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_LSIZE(field, x) \
+ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_PSIZE(field) \
+ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_PSIZE(field, x) \
+ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_COMPRESS(field) \
+ BF64_GET((field), 32, SPA_COMPRESSBITS)
+#define L2BLK_SET_COMPRESS(field, x) \
+ BF64_SET((field), 32, SPA_COMPRESSBITS, x)
+#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
+#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
+#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
+#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
+#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
+#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
+#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
+#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
+
+/*
+ * Exchange two pointer lvalues. Both arguments are parenthesized wherever
+ * they are expanded, and the temporary has a macro-specific name so that a
+ * caller passing a variable called "tmp" is not shadowed.
+ */
+#define PTR_SWAP(x, y) \
+	do { \
+		void *ptr_swap_tmp = (x);\
+		(x) = (y); \
+		(y) = ptr_swap_tmp; \
+		_NOTE(CONSTCOND)\
+	} while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
+#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ zfs_refcount_t l2ad_alloc; /* allocated bytes */
+ /*
+ * Persistence-related stuff
+ */
+ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
+ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* Number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /*
+ * Offset (in bytes) of the first buffer in current log block's
+ * payload.
+ */
+ uint64_t l2ad_log_blk_payload_start;
+ /* Flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+ boolean_t l2ad_rebuild_cancel;
+ boolean_t l2ad_rebuild_began;
+ uint64_t l2ad_log_entries; /* entries per log blk */
+ uint64_t l2ad_evict; /* evicted offset in bytes */
+ /* List of pointers to log blocks present in the L2ARC device */
+ list_t l2ad_lbptr_list;
+} l2arc_dev_t;
+
/*
* Encrypted blocks will need to be stored encrypted on the L2ARC
* disk as they appear in the main pool. In order for this to work we
@@ -206,32 +419,19 @@ typedef struct arc_buf_hdr_crypt {
uint8_t b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;
-typedef struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-} l2arc_dev_t;
-
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
uint32_t b_hits;
-
list_node_t b_l2node;
} l2arc_buf_hdr_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* in-flight list of log blocks */
+ list_t l2wcb_abd_list;
} l2arc_write_callback_t;
struct arc_buf_hdr {
@@ -532,6 +732,71 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_psize;
/* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_l2_hdr_size;
+ /*
+ * Number of L2ARC log blocks written. These are used for restoring the
+ * L2ARC. Updated during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_writes;
+ /*
+ * Moving average of the physical size of the L2ARC log blocks, in
+ * bytes. Updated during L2ARC rebuild and during writing of L2ARC
+ * log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_avg_size;
+ /*
+ * Moving average of the physical size of L2ARC restored data, in bytes,
+ * to the physical size of their metadata in ARC, in bytes.
+ * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ /*
+ * Number of times the L2ARC rebuild was successful for an L2ARC device.
+ */
+ kstat_named_t arcstat_l2_rebuild_success;
+ /*
+ * Number of times the L2ARC rebuild failed because the device header
+ * was in an unsupported format or corrupted.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors
+ * while reading a log block.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors when
+ * reading the device header.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
+ /*
+ * Number of L2ARC log blocks which failed to be restored due to
+ * checksum errors.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
+ /*
+ * Number of times the L2ARC rebuild was aborted due to low system
+ * memory.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ /* Logical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_size;
+ /*
+ * Number of L2ARC log entries (buffers) that were successfully
+ * restored in ARC.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ /*
+ * Number of L2ARC log entries (buffers) already cached in ARC. These
+ * were not restored again.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ /* Physical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_psize;
+ /*
+ * Number of L2ARC log blocks that were restored successfully. Each
+ * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
+ */
+ kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
@@ -617,6 +882,10 @@ extern void arc_tuning_update(boolean_t);
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
+/* used in zdb.c */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 477356aa7..f5aced0da 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -573,6 +573,11 @@ typedef enum zfs_key_location {
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5
+/* Persistent L2ARC version */
+#define L2ARC_PERSISTENT_VERSION_1 1ULL
+#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
+#define L2ARC_PERSISTENT_VERSION_STRING "1"
+
/* Rewind policy information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index e1fab3c69..6e844f5ee 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -787,6 +787,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
+#define SPA_ASYNC_L2CACHE_REBUILD 0x800
/*
* Controls the behavior of spa_vdev_remove().
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 88c4c645b..6c5f61836 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -38,6 +38,7 @@
#include <libzfs.h>
#include <libzfs_impl.h>
#include <libzutil.h>
+#include <sys/arc_impl.h>
/*
* Returns true if the named pool matches the given GUID.
@@ -146,8 +147,10 @@ zpool_clear_label(int fd)
struct stat64 statbuf;
int l;
vdev_label_t *label;
+ l2arc_dev_hdr_phys_t *l2dhdr;
uint64_t size;
- int labels_cleared = 0;
+ int labels_cleared = 0, header_cleared = 0;
+ boolean_t clear_l2arc_header = B_FALSE;
if (fstat64_blk(fd, &statbuf) == -1)
return (0);
@@ -157,8 +160,13 @@ zpool_clear_label(int fd)
if ((label = calloc(1, sizeof (vdev_label_t))) == NULL)
return (-1);
+ if ((l2dhdr = calloc(1, sizeof (l2arc_dev_hdr_phys_t))) == NULL) {
+ free(label);
+ return (-1);
+ }
+
for (l = 0; l < VDEV_LABELS; l++) {
- uint64_t state, guid;
+ uint64_t state, guid, l2cache;
nvlist_t *config;
if (pread64(fd, label, sizeof (vdev_label_t),
@@ -185,6 +193,15 @@ zpool_clear_label(int fd)
continue;
}
+ /* If the device is a cache device clear the header. */
+ if (!clear_l2arc_header) {
+ if (nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
+ l2cache == POOL_STATE_L2CACHE) {
+ clear_l2arc_header = B_TRUE;
+ }
+ }
+
nvlist_free(config);
/*
@@ -202,7 +219,17 @@ zpool_clear_label(int fd)
}
}
+ /* Clear the L2ARC header. */
+ if (clear_l2arc_header) {
+ memset(l2dhdr, 0, sizeof (l2arc_dev_hdr_phys_t));
+ if (pwrite64(fd, l2dhdr, sizeof (l2arc_dev_hdr_phys_t),
+ VDEV_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) {
+ header_cleared++;
+ }
+ }
+
free(label);
+ free(l2dhdr);
if (labels_cleared == 0)
return (-1);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index a7623ff27..40666c8f3 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -87,7 +87,7 @@ Default value: \fB10\fR%.
.ad
.RS 12n
Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction
-of the target arc size.
+of the target ARC size.
.sp
Default value: \fB5\fR.
.RE
@@ -99,7 +99,7 @@ Default value: \fB5\fR.
.ad
.RS 12n
Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR,
-to a log2 fraction of the target arc size.
+to a log2 fraction of the target ARC size.
.sp
Default value: \fB6\fR.
.RE
@@ -179,7 +179,10 @@ Default value: \fB1\fR.
.ad
.RS 12n
How far through the ARC lists to search for L2ARC cacheable content, expressed
-as a multiplier of \fBl2arc_write_max\fR
+as a multiplier of \fBl2arc_write_max\fR.
+ARC persistence across reboots can be achieved with persistent L2ARC by setting
+this parameter to \fB0\fR allowing the full length of ARC lists to be searched
+for cacheable content.
.sp
Default value: \fB2\fR.
.RE
@@ -203,7 +206,7 @@ Default value: \fB200\fR%.
.ad
.RS 12n
Do not write buffers to L2ARC if they were prefetched but not used by
-applications
+applications.
.sp
Use \fB1\fR for yes (default) and \fB0\fR to disable.
.RE
@@ -214,7 +217,7 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable.
\fBl2arc_norw\fR (int)
.ad
.RS 12n
-No reads during writes
+No reads during writes.
.sp
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE
@@ -237,7 +240,7 @@ Default value: \fB8,388,608\fR.
\fBl2arc_write_max\fR (ulong)
.ad
.RS 12n
-Max write bytes per interval
+Max write bytes per interval.
.sp
Default value: \fB8,388,608\fR.
.RE
@@ -245,6 +248,36 @@ Default value: \fB8,388,608\fR.
.sp
.ne 2
.na
+\fBl2arc_rebuild_enabled\fR (int)
+.ad
+.RS 12n
+Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be
+disabled if there are problems importing a pool or attaching an L2ARC device
+(e.g. the L2ARC device is slow in reading stored log metadata, or the metadata
+has become somehow fragmented/unusable).
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_rebuild_blocks_min_l2size\fR (ulong)
+.ad
+.RS 12n
+Min size (in bytes) of an L2ARC device required in order to write log blocks
+in it. The log blocks are used upon importing the pool to rebuild
+the L2ARC (persistent L2ARC). Rationale: for L2ARC devices less than 1GB, the
+amount of data l2arc_evict() evicts is significant compared to the amount of
+restored L2ARC data. In this case do not write log blocks in L2ARC in order not
+to waste space.
+.sp
+Default value: \fB1,073,741,824\fR (1GB).
+.RE
+
+.sp
+.ne 2
+.na
\fBmetaslab_aliquot\fR (ulong)
.ad
.RS 12n
@@ -614,7 +647,7 @@ Default value: \fB1\fR.
.ad
.RS 12n
Sets the maximum number of bytes to consume during pool import to the log2
-fraction of the target arc size.
+fraction of the target ARC size.
.sp
Default value: \fB4\fR.
.RE
@@ -963,7 +996,7 @@ Default value: \fB1\fR.
\fBzfs_arc_min\fR (ulong)
.ad
.RS 12n
-Min arc size of ARC in bytes. If set to 0 then arc_c_min will default to
+Min size of ARC in bytes. If set to 0 then arc_c_min will default to
consuming the larger of 32M or 1/32 of total system memory.
.sp
Default value: \fB0\fR.
@@ -1088,7 +1121,7 @@ Default value: \fB0\fR.
Percent of pagecache to reclaim arc to
This tunable allows ZFS arc to play more nicely with the kernel's LRU
-pagecache. It can guarantee that the arc size won't collapse under scanning
+pagecache. It can guarantee that the ARC size won't collapse under scanning
pressure on the pagecache, yet still allows arc to be reclaimed down to
zfs_arc_min if necessary. This value is specified as percent of pagecache
size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index 8506d5478..3915be3f8 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -212,18 +212,24 @@ If specified multiple times, display counts of each intent log transaction type.
Examine the checkpointed state of the pool.
Note, the on disk format of the pool is not reverted to the checkpointed state.
.It Fl l Ar device
-Read the vdev labels from the specified device.
+Read the vdev labels and L2ARC header from the specified device.
.Nm Fl l
will return 0 if valid label was found, 1 if error occurred, and 2 if no valid
-labels were found. Each unique configuration is displayed only once.
+labels were found. The presence of L2ARC header is indicated by a specific
+sequence (L2ARC_DEV_HDR_MAGIC). Each unique configuration is displayed only
+once.
.It Fl ll Ar device
-In addition display label space usage stats.
+In addition display label space usage stats. If a valid L2ARC header was found
+also display the properties of log blocks used for restoring L2ARC contents
+(persistent L2ARC).
.It Fl lll Ar device
-Display every configuration, unique or not.
+Display every configuration, unique or not. If a valid L2ARC header was found
+also display the properties of log entries in log blocks used for restoring
+L2ARC contents (persistent L2ARC).
.Pp
If the
.Fl q
-option is also specified, don't print the labels.
+option is also specified, don't print the labels or the L2ARC header.
.Pp
If the
.Fl u
diff --git a/man/man8/zpool-labelclear.8 b/man/man8/zpool-labelclear.8
index 52638bdf7..ef6b92e82 100644
--- a/man/man8/zpool-labelclear.8
+++ b/man/man8/zpool-labelclear.8
@@ -48,7 +48,10 @@
.Xc
Removes ZFS label information from the specified
.Ar device .
-The
+If the
+.Ar device
+is a cache device, it also removes the L2ARC header
+(persistent L2ARC). The
.Ar device
must not be part of an active pool configuration.
.Bl -tag -width Ds
diff --git a/man/man8/zpoolconcepts.8 b/man/man8/zpoolconcepts.8
index 60845eef2..c2fc34971 100644
--- a/man/man8/zpoolconcepts.8
+++ b/man/man8/zpoolconcepts.8
@@ -323,8 +323,28 @@ If a read error is encountered on a cache device, that read I/O is reissued to
the original storage pool device, which might be part of a mirrored or raidz
configuration.
.Pp
-The content of the cache devices is considered volatile, as is the case with
-other system caches.
+The content of the cache devices is persistent across reboots and restored
+asynchronously in L2ARC when importing the pool (persistent L2ARC).
+This can be disabled by setting
+.Sy l2arc_rebuild_enabled = 0 .
+For cache devices smaller than 1GB we do not write the metadata structures
+required for rebuilding the L2ARC in order not to waste space. This can be
+changed with
+.Sy l2arc_rebuild_blocks_min_l2size .
+The cache device header (512 bytes) is updated even if no metadata structures
+are written. Setting
+.Sy l2arc_headroom = 0
+will result in scanning the full-length ARC lists for cacheable content to be
+written in L2ARC (persistent L2ARC). If a cache device is added with
+.Nm zpool Cm add
+its label and header will be overwritten and its contents are not going to be
+restored in L2ARC, even if the device was previously part of the pool. If a
+cache device is onlined with
+.Nm zpool Cm online
+its contents will be restored in L2ARC. This is useful in case of memory pressure
+where the contents of the cache device are not fully restored in L2ARC.
+The user can off/online the cache device when there is less memory pressure
+in order to fully restore its contents to L2ARC.
.Ss Pool checkpoint
Before starting critical procedures that include destructive actions (e.g
.Nm zfs Cm destroy
diff --git a/module/os/linux/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c
index bb7f3b69a..fb7c68987 100644
--- a/module/os/linux/zfs/zfs_sysfs.c
+++ b/module/os/linux/zfs/zfs_sysfs.c
@@ -353,13 +353,14 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
* This list is intended for kernel features that don't have a pool feature
* association or that extend existing user kernel interfaces.
*
- * A user processes can easily check if the running zfs kernel module
+ * A user process can easily check if the running zfs kernel module
* supports the new feature.
*/
static const char *zfs_kernel_features[] = {
/* --> Add new kernel features here */
"com.delphix:vdev_initialize",
"org.zfsonlinux:vdev_trim",
+ "org.openzfs:l2arc_persistent",
};
#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index b5d17431c..74bfbfc70 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -21,10 +21,11 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
- * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
*/
/*
@@ -528,6 +529,20 @@ arc_stats_t arc_stats = {
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_success", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "memory_direct_count", KSTAT_DATA_UINT64 },
{ "memory_indirect_count", KSTAT_DATA_UINT64 },
@@ -582,6 +597,24 @@ arc_stats_t arc_stats = {
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru_ghost;
@@ -805,6 +838,9 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
+static kmutex_t l2arc_rebuild_thr_lock;
+static kcondvar_t l2arc_rebuild_thr_cv;
+
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
@@ -816,6 +852,7 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
+static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -825,6 +862,58 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
+/*
+ * Performance tuning of L2ARC persistence:
+ *
+ * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
+ * an L2ARC device (either at pool import or later) will attempt
+ * to rebuild L2ARC buffer contents.
+ * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
+ * whether log blocks are written to the L2ARC device. If the L2ARC
+ * device is less than 1GB, the amount of data l2arc_evict()
+ * evicts is significant compared to the amount of restored L2ARC
+ * data. In this case do not write log blocks in L2ARC in order
+ * not to waste space.
+ */
+int l2arc_rebuild_enabled = B_TRUE;
+unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+
+/* L2ARC persistence rebuild control routines. */
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+
+/* L2ARC persistence read I/O routines. */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
/*
* We use Cityhash for this. It's fast, and has good hash properties without
@@ -1584,6 +1673,42 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
}
/*
+ * Allocates an ARC buf header that's in an evicted & L2-cached state.
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc.
+ */
+arc_buf_hdr_t *
+arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
+ dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
+ enum zio_compress compress, boolean_t protected, boolean_t prefetch)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(size != 0);
+ hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
+ hdr->b_birth = birth;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
+ HDR_SET_LSIZE(hdr, size);
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+ if (prefetch)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+
+ hdr->b_dva = dva;
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = daddr;
+
+ return (hdr);
+}
+
+/*
* Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/
static uint64_t
@@ -7463,6 +7588,103 @@ arc_fini(void)
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
+ * which is an additional piece of metadata which describes what's been
+ * written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ * main ARC buffers. There are 2 linked-lists of log blocks headed by
+ * dh_start_lbps[2]. We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than for correctness. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen on the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement due to the fact that we only find out the
+ * address of the next log block access once the current block has been
+ * completely read in. Obviously, this hurts performance, because we'd be
+ * keeping the device's I/O queue at only 1 operation deep, thus
+ * incurring a large amount of I/O round-trip latency. Having two lists
+ * allows us to fetch two log blocks ahead of where we are currently
+ * rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA with the
+ * birth TXG uniquely identify a block in space and time - once created,
+ * a block is immutable on disk. The worst thing we have done is wasted
+ * some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
*/
static boolean_t
@@ -7508,10 +7730,12 @@ l2arc_write_size(l2arc_dev_t *dev)
* iteration can occur.
*/
dev_size = dev->l2ad_end - dev->l2ad_start;
- if (size >= dev_size) {
+ if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
- "exceeds the size of the cache device (guid %llu), "
- "resetting them to the default (%d)",
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%llu bytes) exceeds the size of the cache device "
+ "(guid %llu), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
@@ -7584,10 +7808,10 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
@@ -7636,12 +7860,14 @@ l2arc_do_free_on_write(void)
static void
l2arc_write_done(zio_t *zio)
{
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
@@ -7738,12 +7964,33 @@ top:
mutex_exit(hash_lock);
}
+ /*
+ * Free the allocated abd buffers for writing the log blocks.
+ * If the zio failed reclaim the allocated space and remove the
+ * pointers to these log blocks from the log block pointer list
+ * of the L2ARC device.
+ */
+ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+ abd_free(abd_buf->abd);
+ zio_buf_free(abd_buf, sizeof (*abd_buf));
+ if (zio->io_error != 0) {
+ lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+ bytes_dropped +=
+ L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+ list_destroy(&cb->l2wcb_abd_list);
+
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
mutex_exit(&dev->l2ad_mtx);
+ ASSERT(dev->l2ad_vdev != NULL);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
l2arc_do_free_on_write();
@@ -8029,8 +8276,31 @@ l2arc_sublist_lock(int list_num)
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+ if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) {
+ return (0);
+ } else {
+ uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+ uint64_t log_blocks = (log_entries +
+ dev->l2ad_dev_hdr->dh_log_blk_ent - 1) /
+ dev->l2ad_dev_hdr->dh_log_blk_ent;
+
+ return (vdev_psize_to_asize(dev->l2ad_vdev,
+ sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+ }
+}
+
+/*
* Evict buffers from the device write hand to the distance specified in
- * bytes. This distance may span populated buffers, it may span nothing.
+ * bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
@@ -8042,19 +8312,25 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
kmutex_t *hash_lock;
uint64_t taddr;
boolean_t rerun;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
buflist = &dev->l2ad_buflist;
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance, dev);
+
top:
rerun = B_FALSE;
if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
/*
* When there is no space to accomodate upcoming writes,
- * evict to the end. Then bump the write hand to the start
- * and iterate. This iteration does not happen indefinitely
- * as we make sure in l2arc_write_size() that when l2ad_hand
- * is reset, the write size does not exceed the end of the
- * device.
+ * evict to the end. Then bump the write and evict hands
+ * to the start and iterate. This iteration does not
+ * happen indefinitely as we make sure in
+ * l2arc_write_size() that when the write hand is reset,
+ * the write size does not exceed the end of the device.
*/
rerun = B_TRUE;
taddr = dev->l2ad_end;
@@ -8064,16 +8340,57 @@ top:
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
+ /*
+ * This check has to be placed after deciding whether to iterate
+ * (rerun).
+ */
if (!all && dev->l2ad_first) {
/*
- * This is the first sweep through the device. There is
+ * This is the first sweep through the device. There is
* nothing to evict.
*/
goto out;
}
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand from the header of
+ * the device. Of note, l2arc_evict() does not actually delete buffers
+ * from the cache device, but keeping track of the evict hand will be
+ * useful when TRIM is implemented.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+
retry:
mutex_enter(&dev->l2ad_mtx);
+ /*
+ * We have to account for evicted log blocks. Run vdev_space_update()
+ * on log blocks whose offset (in bytes) is before the evicted offset
+ * (in bytes) by searching in the list of pointers to log blocks
+ * present in the L2ARC device.
+ */
+ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+ lb_ptr_buf = lb_ptr_buf_prev) {
+
+ lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+ /*
+ * We don't worry about log blocks left behind (i.e.
+ * lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers()
+ * will never write more than l2arc_evict() evicts.
+ */
+ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+ break;
+ } else {
+ vdev_space_update(dev->l2ad_vdev,
+ -L2BLK_GET_PSIZE(
+ (lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0);
+ list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
@@ -8105,7 +8422,7 @@ retry:
ASSERT(!HDR_L2_WRITING(hdr));
ASSERT(!HDR_L2_WRITE_HEAD(hdr));
- if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
+ if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
@@ -8144,12 +8461,17 @@ retry:
mutex_exit(&dev->l2ad_mtx);
out:
- if (rerun) {
+ /*
+ * We need to check if we evict all buffers, otherwise we may iterate
+ * unnecessarily.
+ */
+ if (!all && rerun) {
/*
* Bump device hand to the device start if it is approaching the
* end. l2arc_evict() has already evicted ahead for this case.
*/
dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
goto top;
}
@@ -8272,6 +8594,17 @@ error:
return (ret);
}
+static void
+l2arc_blk_fetch_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+
+ cb = zio->io_private;
+ if (cb->l2rcb_abd != NULL)
+ abd_put(cb->l2rcb_abd);
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
/*
* Find and write ARC buffers to the L2ARC device.
*
@@ -8281,17 +8614,18 @@ error:
* state between calls to this function.
*
* Returns the number of bytes actually written (which may be smaller than
- * the delta by which the device hand has changed due to alignment).
+ * the delta by which the device hand has changed due to alignment and the
+ * writing of log blocks).
*/
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
- l2arc_write_callback_t *cb;
- zio_t *pio, *wzio;
- uint64_t guid = spa_load_guid(spa);
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb = NULL;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
ASSERT3P(dev->l2ad_vdev, !=, NULL);
@@ -8343,7 +8677,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}
passed_sz += HDR_GET_LSIZE(hdr);
- if (passed_sz > headroom) {
+ if (l2arc_headroom != 0 && passed_sz > headroom) {
/*
* Searched too far.
*/
@@ -8443,6 +8777,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ list_create(&cb->l2wcb_abd_list,
+ sizeof (l2arc_lb_abd_buf_t),
+ offsetof(l2arc_lb_abd_buf_t, node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
@@ -8477,6 +8814,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
mutex_exit(hash_lock);
+ /*
+ * Append buf info to current log and commit if full.
+ * arcstat_l2_{size,asize} kstats are updated
+ * internally.
+ */
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
+
zio_nowait(wzio);
}
@@ -8491,6 +8836,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT0(write_lsize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
+
+ /*
+ * Although we did not write any buffers l2ad_evict may
+ * have advanced.
+ */
+ l2arc_dev_hdr_update(dev);
+
return (0);
}
@@ -8500,6 +8852,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
ARCSTAT_INCR(arcstat_l2_psize, write_psize);
+ l2arc_dev_hdr_update(dev);
+
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
@@ -8611,7 +8965,17 @@ l2arc_feed_thread(void *unused)
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
- l2arc_dev_t *dev;
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+/*
+ * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
+ * the vdev_t isn't an L2ARC device.
+ */
+static l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
@@ -8621,7 +8985,7 @@ l2arc_vdev_present(vdev_t *vd)
}
mutex_exit(&l2arc_dev_mtx);
- return (dev != NULL);
+ return (dev);
}
/*
@@ -8631,22 +8995,29 @@ l2arc_vdev_present(vdev_t *vd)
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
- l2arc_dev_t *adddev;
+ l2arc_dev_t *adddev;
+ uint64_t l2dhdr_asize;
ASSERT(!l2arc_vdev_present(vd));
/*
* Create a new l2arc device entry.
*/
- adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ /* leave extra size for an l2arc device header */
+ l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
+ MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
list_link_init(&adddev->l2ad_node);
+ adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
@@ -8656,6 +9027,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+ /*
+ * This is a list of pointers to log blocks that are still present
+ * on the device.
+ */
+ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
+ offsetof(l2arc_lb_ptr_buf_t, node));
+
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
zfs_refcount_create(&adddev->l2ad_alloc);
@@ -8666,6 +9044,89 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Decide if vdev is eligible for L2ARC rebuild
+ */
+ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
+}
+
+void
+l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
+{
+ l2arc_dev_t *dev = NULL;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ uint64_t l2dhdr_asize;
+ spa_t *spa;
+ int err;
+ boolean_t rebuild = B_TRUE;
+
+ dev = l2arc_vdev_get(vd);
+ ASSERT3P(dev, !=, NULL);
+ spa = dev->l2ad_spa;
+ l2dhdr = dev->l2ad_dev_hdr;
+ l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+
+ /*
+ * The L2ARC has to hold at least the payload of one log block for
+ * them to be restored (persistent L2ARC). The payload of a log block
+ * depends on the amount of its log entries. We always write log blocks
+ * with 1022 entries. How many of them are committed or restored depends
+ * on the size of the L2ARC device. Thus the maximum payload of
+ * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
+ * is less than that, we reduce the amount of committed and restored
+ * log entries per block so as to enable persistence.
+ */
+ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
+ dev->l2ad_log_entries = 0;
+ } else {
+ dev->l2ad_log_entries = MIN((dev->l2ad_end -
+ dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
+ L2ARC_LOG_BLK_MAX_ENTRIES);
+ }
+
+ /*
+ * Read the device header, if an error is returned do not rebuild L2ARC.
+ */
+ if ((err = l2arc_dev_hdr_read(dev)) != 0)
+ rebuild = B_FALSE;
+
+ if (rebuild && l2dhdr->dh_log_blk_ent > 0) {
+ /*
+ * If we are onlining a cache device (vdev_reopen) that was
+ * still present (l2arc_vdev_present()) and rebuild is enabled,
+ * we should evict all ARC buffers and pointers to log blocks
+ * and reclaim their space before restoring its contents to
+ * L2ARC.
+ */
+ if (reopen) {
+ if (!l2arc_rebuild_enabled) {
+ return;
+ } else {
+ l2arc_evict(dev, 0, B_TRUE);
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+ }
+ }
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ dev->l2ad_rebuild = B_TRUE;
+ } else if (!rebuild && spa_writeable(spa)) {
+ /*
+ * The boolean rebuild is false if reading the device header
+ * returned an error. In this case create a new header. We
+ * zero out the memory holding the header to reset
+ * dh_start_lbps.
+ */
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
/*
@@ -8674,24 +9135,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
void
l2arc_remove_vdev(vdev_t *vd)
{
- l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+ l2arc_dev_t *remdev = NULL;
/*
* Find the device by vdev
*/
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
- nextdev = list_next(l2arc_dev_list, dev);
- if (vd == dev->l2ad_vdev) {
- remdev = dev;
- break;
- }
- }
+ remdev = l2arc_vdev_get(vd);
ASSERT3P(remdev, !=, NULL);
/*
+ * Cancel any ongoing or scheduled rebuild.
+ */
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (remdev->l2ad_rebuild_began == B_TRUE) {
+ remdev->l2ad_rebuild_cancel = B_TRUE;
+ while (remdev->l2ad_rebuild == B_TRUE)
+ cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ /*
* Remove device from global list
*/
+ mutex_enter(&l2arc_dev_mtx);
list_remove(l2arc_dev_list, remdev);
l2arc_dev_last = NULL; /* may have been invalidated */
atomic_dec_64(&l2arc_ndev);
@@ -8702,9 +9168,12 @@ l2arc_remove_vdev(vdev_t *vd)
*/
l2arc_evict(remdev, 0, B_TRUE);
list_destroy(&remdev->l2ad_buflist);
+ ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
+ list_destroy(&remdev->l2ad_lbptr_list);
mutex_destroy(&remdev->l2ad_mtx);
zfs_refcount_destroy(&remdev->l2ad_alloc);
- kmem_free(remdev, sizeof (l2arc_dev_t));
+ kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
+ vmem_free(remdev, sizeof (l2arc_dev_t));
}
void
@@ -8717,6 +9186,8 @@ l2arc_init(void)
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -8741,6 +9212,8 @@ l2arc_fini(void)
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_rebuild_thr_lock);
+ cv_destroy(&l2arc_rebuild_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
@@ -8772,6 +9245,865 @@ l2arc_stop(void)
mutex_exit(&l2arc_feed_thr_lock);
}
+/*
+ * Starts a rebuild thread for every L2ARC device of `spa' that is flagged
+ * for rebuild and has not been cancelled. This should be called after pool
+ * import from the spa async thread, since starting these threads directly
+ * from spa_import() will make them part of the "zpool import" context and
+ * delay process exit (and thus pool import).
+ *
+ * The caller must hold spa_namespace_lock.
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+	int idx = 0;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	/* Walk the spa's cache vdevs, one candidate device per slot. */
+	while (idx < spa->spa_l2cache.sav_count) {
+		l2arc_dev_t *l2dev =
+		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[idx++]);
+
+		/* No L2ARC device means the vdev is UNAVAIL: nothing to do. */
+		if (l2dev != NULL) {
+			mutex_enter(&l2arc_rebuild_thr_lock);
+			if (!l2dev->l2ad_rebuild_cancel &&
+			    l2dev->l2ad_rebuild) {
+				/* Mark as begun before the thread exists. */
+				l2dev->l2ad_rebuild_began = B_TRUE;
+				(void) thread_create(NULL, 0,
+				    (void (*)(void *))l2arc_dev_rebuild_start,
+				    l2dev, 0, &p0, TS_RUN, minclsyspri);
+			}
+			mutex_exit(&l2arc_rebuild_thr_lock);
+		}
+	}
+}
+
+/*
+ * Main entry point for L2ARC rebuilding; runs as the body of the per-device
+ * rebuild thread created by l2arc_spa_rebuild_start().
+ *
+ * Runs l2arc_rebuild() unless cancellation was already requested, then
+ * clears the device's rebuild state and wakes every thread blocked in
+ * l2arc_remove_vdev() waiting for the rebuild to stop.
+ *
+ * The wakeup is deliberately the last access to `dev': l2arc_remove_vdev()
+ * frees the device once it observes l2ad_rebuild == B_FALSE.
+ */
+static void
+l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+{
+	boolean_t cancelled;
+
+	VERIFY(dev->l2ad_rebuild);
+
+	/*
+	 * l2arc_remove_vdev() may set l2ad_rebuild_cancel as soon as
+	 * l2ad_rebuild_began is set, i.e. before this thread gets to run.
+	 * Treat that as a normal cancellation rather than panicking.
+	 */
+	mutex_enter(&l2arc_rebuild_thr_lock);
+	cancelled = dev->l2ad_rebuild_cancel;
+	mutex_exit(&l2arc_rebuild_thr_lock);
+
+	if (!cancelled)
+		(void) l2arc_rebuild(dev);
+
+	/*
+	 * Always wake waiters, whether the rebuild completed, failed or was
+	 * cancelled. The condvar is shared by all devices, so broadcast and
+	 * let each waiter re-check its own device's flag.
+	 */
+	mutex_enter(&l2arc_rebuild_thr_lock);
+	dev->l2ad_rebuild_began = B_FALSE;
+	dev->l2ad_rebuild = B_FALSE;
+	cv_broadcast(&l2arc_rebuild_thr_cv);
+	mutex_exit(&l2arc_rebuild_thr_lock);
+
+	thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It starts
+ * reading the log block chain and restores each block's contents to memory
+ * (reconstructing arc_buf_hdr_t's).
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log block chain.
+ * 2) We encounter *any* error condition (cksum errors, io errors).
+ * 3) The system is low on memory (ENOMEM).
+ * 4) The rebuild is cancelled via dev->l2ad_rebuild_cancel (ECANCELED).
+ *
+ * Returns 0 on success, otherwise an error code. This function does not
+ * clear dev->l2ad_rebuild and does not signal l2arc_rebuild_thr_cv: the
+ * calling thread does both once it is completely done with `dev', so that
+ * l2arc_remove_vdev() cannot free the device while it is still in use here.
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+	vdev_t *vd = dev->l2ad_vdev;
+	spa_t *spa = vd->vdev_spa;
+	int i = 0, err = 0;
+	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+	l2arc_log_blk_phys_t *this_lb, *next_lb;
+	zio_t *this_io = NULL, *next_io = NULL;
+	l2arc_log_blkptr_t lbps[2];
+	l2arc_lb_ptr_buf_t *lb_ptr_buf;
+	boolean_t lock_held;
+
+	this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+	next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+
+	/*
+	 * We prevent device removal while issuing reads to the device,
+	 * then during the rebuilding phases we drop this lock again so
+	 * that a spa_unload or device remove can be initiated - this is
+	 * safe, because the spa will signal us to stop before removing
+	 * our device and wait for us to stop.
+	 */
+	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+	lock_held = B_TRUE;
+
+	/*
+	 * Retrieve the persistent L2ARC device state.
+	 */
+	dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
+	dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
+	    L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
+	    dev->l2ad_start);
+	dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+	/*
+	 * In case the zfs module parameter l2arc_rebuild_enabled is false
+	 * we do not start the rebuild process.
+	 */
+	if (!l2arc_rebuild_enabled)
+		goto out;
+
+	/* Prepare the rebuild process */
+	bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+
+	/* Start the rebuild process */
+	for (;;) {
+		if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
+			break;
+
+		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
+		    this_lb, next_lb, this_io, &next_io)) != 0)
+			goto out;
+
+		/*
+		 * Our memory pressure valve. If the system is running low
+		 * on memory, rather than swamping memory with new ARC buf
+		 * hdrs, we opt not to rebuild the L2ARC. At this point,
+		 * however, we have already set up our L2ARC dev to chain in
+		 * new metadata log blocks, so the user may choose to offline/
+		 * online the L2ARC dev at a later time (or re-import the pool)
+		 * to reconstruct it (when there's less memory pressure).
+		 */
+		if (arc_reclaim_needed()) {
+			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+			cmn_err(CE_NOTE, "System running low on memory, "
+			    "aborting L2ARC rebuild.");
+			err = SET_ERROR(ENOMEM);
+			goto out;
+		}
+
+		spa_config_exit(spa, SCL_L2ARC, vd);
+		lock_held = B_FALSE;
+
+		/*
+		 * Now that we know that the next_lb checks out alright, we
+		 * can start reconstruction from this log block.
+		 */
+		l2arc_log_blk_restore(dev, this_lb,
+		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop),
+		    lbps[0].lbp_daddr);
+		i++;
+
+		/*
+		 * log block restored, include its pointer in the list of
+		 * pointers to log blocks present in the L2ARC device.
+		 */
+		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+		    KM_SLEEP);
+		bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+		    sizeof (l2arc_log_blkptr_t));
+		mutex_enter(&dev->l2ad_mtx);
+		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+		mutex_exit(&dev->l2ad_mtx);
+		vdev_space_update(vd,
+		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0);
+
+		/*
+		 * Protection against loops of log blocks:
+		 *
+		 * l2ad_hand l2ad_evict
+		 * V V
+		 * l2ad_start |=======================================| l2ad_end
+		 * -----|||----|||---|||----|||
+		 * (3) (2) (1) (0)
+		 * ---|||---|||----|||---|||
+		 * (7) (6) (5) (4)
+		 *
+		 * In this situation the pointer of log block (4) passes
+		 * l2arc_log_blkptr_valid() but the log block should not be
+		 * restored as it is overwritten by the payload of log block
+		 * (0). Only log blocks (0)-(3) should be restored. We check
+		 * whether l2ad_evict lies in between the next log block
+		 * offset (lbps[1].lbp_daddr) and the present log block offset
+		 * (lbps[0].lbp_daddr). If true and this isn't the first pass,
+		 * we are looping from the beginning and we should stop.
+		 */
+		if (l2arc_range_check_overlap(lbps[1].lbp_daddr,
+		    lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first)
+			goto out;
+
+		for (;;) {
+			/*
+			 * On cancellation only report it. The rebuild thread
+			 * clears l2ad_rebuild and wakes the remover after we
+			 * return; doing it here would allow `dev' to be freed
+			 * while this thread is still using it.
+			 */
+			mutex_enter(&l2arc_rebuild_thr_lock);
+			if (dev->l2ad_rebuild_cancel) {
+				mutex_exit(&l2arc_rebuild_thr_lock);
+				err = SET_ERROR(ECANCELED);
+				goto out;
+			}
+			mutex_exit(&l2arc_rebuild_thr_lock);
+			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+			    RW_READER)) {
+				lock_held = B_TRUE;
+				break;
+			}
+			/*
+			 * L2ARC config lock held by somebody in writer,
+			 * possibly due to them trying to remove us. They'll
+			 * likely want us to shut down, so after a little
+			 * delay, we check l2ad_rebuild_cancel and retry
+			 * the lock again.
+			 */
+			delay(1);
+		}
+
+		/*
+		 * Continue with the next log block.
+		 */
+		lbps[0] = lbps[1];
+		lbps[1] = this_lb->lb_prev_lbp;
+		PTR_SWAP(this_lb, next_lb);
+		this_io = next_io;
+		next_io = NULL;
+	}
+
+	/* Chain ended with a prefetch still outstanding: reap it. */
+	if (this_io != NULL)
+		l2arc_log_blk_fetch_abort(this_io);
+out:
+	if (next_io != NULL)
+		l2arc_log_blk_fetch_abort(next_io);
+	vmem_free(this_lb, sizeof (*this_lb));
+	vmem_free(next_lb, sizeof (*next_lb));
+
+	if (!l2arc_rebuild_enabled) {
+		zfs_dbgmsg("L2ARC rebuild disabled");
+	} else if (err == 0 && i > 0) {
+		ARCSTAT_BUMP(arcstat_l2_rebuild_success);
+		zfs_dbgmsg("L2ARC successfully rebuilt, "
+		    "restored %d blocks", i);
+	} else if (err != 0) {
+		zfs_dbgmsg("L2ARC rebuild aborted, "
+		    "restored %d blocks", i);
+	}
+
+	if (lock_held)
+		spa_config_exit(spa, SCL_L2ARC, vd);
+
+	return (err);
+}
+
+/*
+ * Attempts to read the device header on the provided L2ARC device into
+ * dev->l2ad_dev_hdr and validates it against the device.
+ *
+ * Returns 0 on success, the zio error if the read fails, or ENOTSUP if the
+ * header does not describe this device (bad magic, different pool or vdev
+ * guid, different version, or mismatching layout).
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev)
+{
+	int err;
+	uint64_t guid;
+	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+	const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+	abd_t *abd;
+
+	guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+	/* Read straight into the in-core header buffer. */
+	abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+	err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+	    VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
+	    ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+	    ZIO_FLAG_SPECULATIVE, B_FALSE));
+
+	abd_put(abd);
+
+	if (err != 0) {
+		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
+		/* %llu needs unsigned long long; uint64_t may be narrower. */
+		zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
+		    "vdev guid: %llu", err,
+		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
+		return (err);
+	}
+
+	/* Header written with the opposite byte order: swap it in place. */
+	if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+		byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
+
+	if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
+	    l2dhdr->dh_spa_guid != guid ||
+	    l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
+	    l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
+	    l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries ||
+	    l2dhdr->dh_end != dev->l2ad_end ||
+	    !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
+	    l2dhdr->dh_evict)) {
+		/*
+		 * Attempt to rebuild a device containing no actual dev hdr
+		 * or containing a header from some other pool or from another
+		 * version of persistent L2ARC.
+		 */
+		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log
+ * block address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return an error, the
+ * caller can assume that we have taken care of cleanup of fetch IOs; on
+ * success the caller owns whatever was stored in `next_io'.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+    const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+    zio_t *this_io, zio_t **next_io)
+{
+	int err = 0;
+	zio_cksum_t cksum;
+	abd_t *abd = NULL;
+	uint64_t psize;
+
+	ASSERT(this_lbp != NULL && next_lbp != NULL);
+	ASSERT(this_lb != NULL && next_lb != NULL);
+	ASSERT(next_io != NULL && *next_io == NULL);
+	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+	/* On-disk size of this log block; used by every step below. */
+	psize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
+
+	/*
+	 * Check to see if we have issued the IO for this log block in a
+	 * previous run. If not, this is the first call, so issue it now.
+	 */
+	if (this_io == NULL) {
+		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+		    this_lb);
+	}
+
+	/*
+	 * Peek to see if we can start issuing the next IO immediately.
+	 */
+	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+		/*
+		 * Start issuing IO for the next log block early - this
+		 * should help keep the L2ARC device busy while we
+		 * decompress and restore this log block.
+		 */
+		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+		    next_lb);
+	}
+
+	/* Wait for the IO to read this log block to complete */
+	if ((err = zio_wait(this_io)) != 0) {
+		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+		/* %llu needs unsigned long long; uint64_t may be narrower. */
+		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+		    "offset: %llu, vdev guid: %llu", err,
+		    (u_longlong_t)this_lbp->lbp_daddr,
+		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
+		goto cleanup;
+	}
+
+	/* Make sure the buffer checks out */
+	fletcher_4_native(this_lb, psize, NULL, &cksum);
+	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
+		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
+		zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
+		    "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
+		    (u_longlong_t)this_lbp->lbp_daddr,
+		    (u_longlong_t)dev->l2ad_vdev->vdev_guid,
+		    (u_longlong_t)dev->l2ad_hand,
+		    (u_longlong_t)dev->l2ad_evict);
+		err = SET_ERROR(ECKSUM);
+		goto cleanup;
+	}
+
+	/* Now we can take our time decoding this buffer */
+	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
+	case ZIO_COMPRESS_OFF:
+		break;
+	case ZIO_COMPRESS_LZ4:
+		/*
+		 * Decompress in place: copy the compressed bytes aside and
+		 * expand them back into this_lb.
+		 */
+		abd = abd_alloc_for_io(psize, B_TRUE);
+		abd_copy_from_buf_off(abd, this_lb, 0, psize);
+		if ((err = zio_decompress_data(
+		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
+		    abd, this_lb, psize, sizeof (*this_lb))) != 0) {
+			err = SET_ERROR(EINVAL);
+			goto cleanup;
+		}
+		break;
+	default:
+		err = SET_ERROR(EINVAL);
+		goto cleanup;
+	}
+	/* Block written with the opposite byte order: swap it in place. */
+	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+		byteswap_uint64_array(this_lb, sizeof (*this_lb));
+	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
+		err = SET_ERROR(EINVAL);
+		goto cleanup;
+	}
+cleanup:
+	/* Abort an in-flight fetch I/O in case of error */
+	if (err != 0 && *next_io != NULL) {
+		l2arc_log_blk_fetch_abort(*next_io);
+		*next_io = NULL;
+	}
+	if (abd != NULL)
+		abd_free(abd);
+	return (err);
+}
+
+/*
+ * Restores the payload of one log block into ARC by calling
+ * l2arc_hdr_restore() on each of its entries, then records rebuild
+ * statistics for the block.
+ *
+ * dev      - L2ARC device the log block was read from
+ * lb       - decoded log block
+ * lb_psize - physical size of the log block, used for the size averages
+ * lb_daddr - device address of the log block (not used here)
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
+    uint64_t lb_psize, uint64_t lb_daddr)
+{
+	uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent;
+	uint64_t lsize_sum = 0;
+	uint64_t psize_sum = 0;
+	int slot = log_entries - 1;
+
+	/*
+	 * Entries are walked last-to-first, i.e. in reverse temporal order,
+	 * and l2arc_hdr_restore() appends to the tail of l2ad_buflist. New
+	 * buffers written by the fill thread go to the head of that list,
+	 * so the list keeps its newest-at-head, oldest-at-tail ordering
+	 * while the rebuild proceeds, even if restored buffers are evicted
+	 * in the meantime.
+	 */
+	while (slot >= 0) {
+		const l2arc_log_ent_phys_t *ent = &lb->lb_entries[slot];
+
+		lsize_sum += L2BLK_GET_LSIZE((ent)->le_prop);
+		psize_sum += L2BLK_GET_PSIZE((ent)->le_prop);
+		l2arc_hdr_restore(ent, dev);
+		slot--;
+	}
+
+	/*
+	 * Rebuild stats: lsize_sum is the logical and psize_sum the physical
+	 * size of the buffers restored from this log block.
+	 */
+	ARCSTAT_INCR(arcstat_l2_rebuild_size, lsize_sum);
+	ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize_sum);
+	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
+	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
+	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize_sum / lb_psize);
+	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ *
+ * le  - log entry describing the buffer (dva, birth, device address, sizes
+ *       and flags packed in le_prop)
+ * dev - L2ARC device the buffer lives on
+ *
+ * If a header with the same identity is already in the hash table, the new
+ * header is destroyed; the existing one is linked to this device only when
+ * it does not already carry L2ARC metadata.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+ uint64_t asize;
+
+ /*
+ * Do all the allocation before grabbing any locks, this lets us
+ * sleep if memory is full and we don't have to deal with failed
+ * allocations.
+ */
+ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+ dev, le->le_dva, le->le_daddr,
+ L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+ L2BLK_GET_COMPRESS((le)->le_prop),
+ L2BLK_GET_PROTECTED((le)->le_prop),
+ L2BLK_GET_PREFETCH((le)->le_prop));
+ asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((le)->le_prop));
+
+ /*
+ * vdev_space_update() has to be called before arc_hdr_destroy() to
+ * avoid underflow since the latter also calls the former.
+ */
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr));
+
+ /* Account the new header on the device before publishing it. */
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * hash_lock is filled in by buf_hash_insert() and released at the end
+ * of this function on both the inserted and the already-exists path.
+ */
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ arc_hdr_destroy(hdr);
+ /*
+ * If the buffer is already cached, check whether it has
+ * L2ARC metadata. If not, enter them and update the flag.
+ * This is important in case of onlining a cache device, since
+ * we previously evicted all L2ARC metadata from ARC.
+ */
+ if (!HDR_HAS_L2HDR(exists)) {
+ arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+ exists->b_l2hdr.b_dev = dev;
+ exists->b_l2hdr.b_daddr = le->le_daddr;
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, exists);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(exists), exists);
+ mutex_exit(&dev->l2ad_mtx);
+ /* Same accounting as above, now for the existing header. */
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists));
+ ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists));
+ }
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ }
+
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ *
+ * vd  - cache vdev to read from
+ * lbp - pointer to the log block (device address and size)
+ * lb  - destination buffer the block is read into
+ *
+ * Returns the root zio of the read. Its callback argument is newly
+ * allocated here, so the zio must eventually be waited on, either via
+ * zio_wait() or via l2arc_log_blk_fetch_abort().
+ */
+static zio_t *
+l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
+    l2arc_log_blk_phys_t *lb)
+{
+	uint32_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+	l2arc_read_callback_t *cb;
+	zio_t *root, *rzio;
+
+	ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
+
+	/* Callback state: wraps the caller's buffer in an abd. */
+	cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
+	cb->l2rcb_abd = abd_get_from_buf(lb, psize);
+
+	/* Root zio whose done callback receives cb. */
+	root = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+	    ZIO_FLAG_DONT_RETRY);
+
+	/* Physical read of the log block, issued as a child of the root. */
+	rzio = zio_read_phys(root, vd, lbp->lbp_daddr, psize,
+	    cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE);
+	(void) zio_nowait(rzio);
+
+	return (root);
+}
+
+/*
+ * Disposes of a zio returned from l2arc_log_blk_fetch by waiting for it to
+ * complete and discarding the result.
+ *
+ * NOTE(review): the buffers allocated for the fetch are presumably released
+ * by the l2arc_blk_fetch_done callback registered on the root zio in
+ * l2arc_log_blk_fetch - confirm against that callback.
+ */
+static void
+l2arc_log_blk_fetch_abort(zio_t *zio)
+{
+ /* The IO is not interrupted; we only wait for it and ignore errors. */
+ (void) zio_wait(zio);
+}
+
+/*
+ * Refreshes the in-core device header of an l2arc device from the device's
+ * current state and writes it to the device. The write is issued without a
+ * parent zio and waited on here; a failure is only logged.
+ */
+static void
+l2arc_dev_hdr_update(l2arc_dev_t *dev)
+{
+	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+	const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+	abd_t *abd;
+	int err;
+
+	l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
+	l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
+	l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+	l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
+	l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries;
+	l2dhdr->dh_evict = dev->l2ad_evict;
+	l2dhdr->dh_start = dev->l2ad_start;
+	l2dhdr->dh_end = dev->l2ad_end;
+	/* Flags are rebuilt from scratch on every update. */
+	l2dhdr->dh_flags = 0;
+	if (dev->l2ad_first)
+		l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+	abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+	err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
+	    VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
+	    NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
+
+	abd_put(abd);
+
+	if (err != 0) {
+		/* %llu needs unsigned long long; uint64_t may be narrower. */
+		zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
+		    "vdev guid: %llu", err,
+		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
+	}
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ *
+ * dev - device whose in-core log block (dev->l2ad_log_blk) is committed
+ * pio - parent zio the log block write is issued under
+ * cb  - write callback state; the abd wrapper for the written buffer is
+ *       appended to cb->l2wcb_abd_list
+ *
+ * Besides issuing the write, this advances dev->l2ad_hand by the allocated
+ * size, updates the start log block pointers in the in-core device header
+ * and resets the in-core log block bookkeeping for the next block.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ uint64_t psize, asize;
+ zio_t *wzio;
+ l2arc_lb_abd_buf_t *abd_buf;
+ uint8_t *tmpbuf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+
+ /* Only full log blocks are committed. */
+ VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent);
+
+ tmpbuf = zio_buf_alloc(sizeof (*lb));
+ abd_buf = zio_buf_alloc(sizeof (*abd_buf));
+ abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
+
+ /* link the buffer into the block chain */
+ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
+ lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /* try to compress the buffer */
+ list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
+ psize = zio_compress_data(ZIO_COMPRESS_LZ4,
+ abd_buf->abd, tmpbuf, sizeof (*lb));
+
+ /* a log block is never entirely zero */
+ ASSERT(psize != 0);
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ ASSERT(asize <= sizeof (*lb));
+
+ /*
+ * Update the start log block pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
+ l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
+ l2dhdr->dh_start_lbps[0].lbp_payload_asize =
+ dev->l2ad_log_blk_payload_asize;
+ l2dhdr->dh_start_lbps[0].lbp_payload_start =
+ dev->l2ad_log_blk_payload_start;
+ _NOTE(CONSTCOND)
+ L2BLK_SET_LSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
+ L2BLK_SET_PSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
+ L2BLK_SET_CHECKSUM(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_CHECKSUM_FLETCHER_4);
+ if (asize < sizeof (*lb)) {
+ /* compression succeeded */
+ /* zero the padding between the compressed and allocated size */
+ bzero(tmpbuf + psize, asize - psize);
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ /* write the log block verbatim */
+ bcopy(lb, tmpbuf, sizeof (*lb));
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_OFF);
+ }
+
+ /* checksum what we're about to write */
+ fletcher_4_native(tmpbuf, asize, NULL,
+ &l2dhdr->dh_start_lbps[0].lbp_cksum);
+
+ /* drop the abd that wrapped the in-core log block */
+ abd_put(abd_buf->abd);
+
+ /* perform the write itself */
+ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
+ abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ dev->l2ad_hand += asize;
+ /*
+ * Include the committed log block's pointer in the list of pointers
+ * to log blocks present in the L2ARC device.
+ */
+ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+}
+
+/*
+ * Validates an L2ARC log block pointer to make sure that the log block it
+ * describes can be read from the provided L2ARC device.
+ *
+ * Returns B_TRUE if the pointer is usable, B_FALSE otherwise.
+ */
+boolean_t
+l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
+{
+	uint64_t lb_psize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+	uint64_t lb_last = lbp->lbp_daddr + lb_psize - 1;
+	uint64_t payload_first = lbp->lbp_payload_start;
+
+	/*
+	 * A log block is valid if all of the following conditions are true:
+	 * - it fits entirely (including its payload) between l2ad_start and
+	 *   l2ad_end
+	 * - it has a valid size
+	 * - neither the log block itself nor part of its payload was evicted
+	 *   by l2arc_evict():
+	 *
+	 * l2ad_hand l2ad_evict
+	 * | | lbp_daddr
+	 * | start | | end
+	 * | | | | |
+	 * V V V V V
+	 * l2ad_start ============================================ l2ad_end
+	 * --------------------------||||
+	 * ^ ^
+	 * | log block
+	 * payload
+	 */
+
+	/* Must lie inside the device's usable range. */
+	if (payload_first < dev->l2ad_start || lb_last > dev->l2ad_end)
+		return (B_FALSE);
+
+	/* Must have a sane size. */
+	if (lb_psize == 0 || lb_psize > sizeof (l2arc_log_blk_phys_t))
+		return (B_FALSE);
+
+	/* When l2ad_first is set the eviction checks do not apply. */
+	if (dev->l2ad_first)
+		return (B_TRUE);
+
+	/* Evicted if the hand/evict window and the block range intersect. */
+	if (l2arc_range_check_overlap(payload_first, lb_last,
+	    dev->l2ad_hand) ||
+	    l2arc_range_check_overlap(payload_first, lb_last,
+	    dev->l2ad_evict) ||
+	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict,
+	    payload_first) ||
+	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict,
+	    lb_last))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Records ARC buffer header `hdr' as the next entry of the device's
+ * in-core log block. The buffer being inserted must be present in L2ARC.
+ *
+ * Returns B_TRUE if the log block is now full and needs to be committed to
+ * L2ARC, or B_FALSE if it still has room for more ARC buffers. Nothing is
+ * recorded (and B_FALSE is returned) when the device header says log
+ * blocks hold zero entries.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
+{
+	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+	l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+	l2arc_log_ent_phys_t *le;
+
+	if (l2dhdr->dh_log_blk_ent == 0)
+		return (B_FALSE);
+
+	/* Claim the next free slot. */
+	int slot = dev->l2ad_log_ent_idx++;
+
+	ASSERT3S(slot, <, l2dhdr->dh_log_blk_ent);
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	le = &lb->lb_entries[slot];
+	bzero(le, sizeof (*le));
+
+	/* Identity and location of the buffer. */
+	le->le_dva = hdr->b_dva;
+	le->le_birth = hdr->b_birth;
+	le->le_daddr = hdr->b_l2hdr.b_daddr;
+
+	/* The first entry marks where this log block's payload begins. */
+	if (slot == 0)
+		dev->l2ad_log_blk_payload_start = le->le_daddr;
+
+	/* Sizes and flags are packed into le_prop. */
+	L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
+	L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
+	L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
+	L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
+	L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
+	L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+
+	/* Running total of the allocated size of the payload. */
+	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
+	    HDR_GET_PSIZE(hdr));
+
+	if (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom -- Lower end of the range to check (written to earlier).
+ * top -- Upper end of the range to check (written to later).
+ * check -- The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+	/* Degenerate range: a single address. */
+	if (bottom == top)
+		return (check == top);
+
+	/* Sequentially ordered range: check must lie in [bottom, top]. */
+	if (bottom < top)
+		return (check >= bottom && check <= top);
+
+	/* Range wraps around the end of the device. */
+	return (check >= bottom || check <= top);
+}
+
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
@@ -8861,6 +10193,12 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
"No reads during writes");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
+ "Rebuild the L2ARC when importing a pool");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
+ "Min size in bytes to write rebuild log blocks in L2ARC");
+
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 0d4646c15..aface90af 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -4860,6 +4860,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
}
spa_import_progress_remove(spa_guid(spa));
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
spa_load_note(spa, "LOADED");
return (0);
@@ -7986,6 +7988,17 @@ spa_async_thread(void *arg)
}
/*
+ * Kick off L2 cache rebuilding.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+ l2arc_spa_rebuild_start(spa);
+ spa_config_exit(spa, SCL_L2ARC, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
* Let the world know that we're done.
*/
mutex_enter(&spa->spa_async_lock);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index fb0d02eea..59147ce31 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2279,9 +2279,22 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd))
- l2arc_add_vdev(spa, vd);
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+ * When reopening we can assume the device label already
+ * has the attribute l2cache_persistent, since we've
+ * opened the device in the past and updated the label.
+ * In case the vdev is present we should evict all ARC
+ * buffers and pointers to log blocks and reclaim their
+ * space before restoring its contents to L2ARC.
+ */
+ if (l2arc_vdev_present(vd)) {
+ l2arc_rebuild_vdev(vd, B_TRUE);
+ } else {
+ l2arc_add_vdev(spa, vd);
+ }
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ }
} else {
(void) vdev_validate(vd);
}
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 897a6a955..61df6d420 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -164,3 +164,9 @@ tags = ['functional', 'user_namespace']
tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos',
'userquota_013_pos', 'userspace_003_pos']
tags = ['functional', 'userquota']
+
+[tests/functional/persist_l2arc:Linux]
+tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos',
+ 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos',
+ 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos']
+tags = ['functional', 'persist_l2arc']
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 62d335abe..680fcf42c 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -36,6 +36,8 @@ INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
INITIALIZE_VALUE initialize_value zfs_initialize_value
KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export
L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch
+L2ARC_REBUILD_BLOCKS_MIN_L2SIZE UNSUPPORTED l2arc_rebuild_blocks_min_l2size
+L2ARC_REBUILD_ENABLED UNSUPPORTED l2arc_rebuild_enabled
L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost
L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max
LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index bd484b2da..776222f08 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -45,6 +45,7 @@ SUBDIRS = \
no_space \
nopwrite \
online_offline \
+ persist_l2arc \
pool_checkpoint \
pool_names \
poolversion \
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am b/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am
new file mode 100644
index 000000000..14a43de9c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am
@@ -0,0 +1,15 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/persist_l2arc
+dist_pkgdata_SCRIPTS = \
+ cleanup.ksh \
+ setup.ksh \
+ persist_l2arc_001_pos.ksh \
+ persist_l2arc_002_pos.ksh \
+ persist_l2arc_003_neg.ksh \
+ persist_l2arc_004_pos.ksh \
+ persist_l2arc_005_pos.ksh \
+ persist_l2arc_006_pos.ksh \
+ persist_l2arc_007_pos.ksh \
+ persist_l2arc_008_pos.ksh
+
+dist_pkgdata_DATA = \
+ persist_l2arc.cfg
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh
new file mode 100755
index 000000000..828de3862
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh
@@ -0,0 +1,31 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+verify_runnable "global"
+
+if datasetexists $TESTPOOL ; then
+ log_must zpool destroy -f $TESTPOOL
+fi
+
+log_must rm -rf $VDIR
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg
new file mode 100644
index 000000000..60bb24637
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg
@@ -0,0 +1,37 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+export SIZE=1G
+export VDIR=$TESTDIR/disk.persist_l2arc
+export VDEV="$VDIR/a"
+export VDEV_CACHE="$VDIR/b"
+
+# fio options
+export DIRECTORY=/$TESTPOOL
+export NUMJOBS=4
+export RUNTIME=30
+export PERF_RANDSEED=1234
+export PERF_COMPPERCENT=66
+export PERF_COMPCHUNK=0
+export BLOCKSIZE=128K
+export SYNC_TYPE=0
+export DIRECT=1
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh
new file mode 100755
index 000000000..b202fac40
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Persistent L2ARC with an unencrypted ZFS file system succeeds
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Export and re-import pool without writing any data.
+# 3. Create a random file in that pool and random read for 30 sec.
+# 4. Export pool.
+# 5. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 6. Import pool.
+# 7. Read the amount of log blocks rebuilt in arcstats and compare to
+# (5).
+# 8. Check if the labels of the L2ARC device are intact.
+#
+# * We can predict the minimum bytes of L2ARC restored if we subtract
+# from the effective size of the cache device the bytes l2arc_evict()
+# evicts:
+# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize
+# wr_sz: l2arc_write_max + l2arc_write_boost (worst case)
+# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) *
+# sizeof (l2arc_log_blk_phys_t)
+# min restored size: l2 - (wr_sz + blk_overhead)
+#
+
+verify_runnable "global"
+
+log_assert "Persistent L2ARC with an unencrypted ZFS file system succeeds."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+ log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \
+ $rebuild_blocks_min_l2size
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( floor($fill_mb / 2) ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $VDIR $TESTPOOL
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool export $TESTPOOL
+
+typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool import -d $VDIR $TESTPOOL
+
+sleep 2
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk -gt 0
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Persistent L2ARC with an unencrypted ZFS file system succeeds."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh
new file mode 100755
index 000000000..ae219e01a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh
@@ -0,0 +1,112 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+
+#
+# DESCRIPTION:
+# Persistent L2ARC with an encrypted ZFS file system succeeds
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create an encrypted ZFS file system.
+# 3. Create a random file in the encrypted file system and random
+# read for 30 sec.
+# 4. Export pool.
+# 5. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 6. Import pool.
+# 7. Mount the encrypted ZFS file system.
+# 8. Read the amount of log blocks rebuilt in arcstats and compare to
+# (5).
+# 9. Check if the labels of the L2ARC device are intact.
+#
+# * We can predict the minimum bytes of L2ARC restored if we subtract
+# from the effective size of the cache device the bytes l2arc_evict()
+# evicts:
+# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize
+# wr_sz: l2arc_write_max + l2arc_write_boost (worst case)
+# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) *
+# sizeof (l2arc_log_blk_phys_t)
+# min restored size: l2 - (wr_sz + blk_overhead)
+#
+
+verify_runnable "global"
+
+log_assert "Persistent L2ARC with an encrypted ZFS file system succeeds."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+ log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \
+ $rebuild_blocks_min_l2size
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( floor($fill_mb / 2) ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
+ "-o keyformat=passphrase $TESTPOOL/$TESTFS1"
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool export $TESTPOOL
+
+sleep 2
+
+typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool import -d $VDIR $TESTPOOL
+log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1"
+
+sleep 2
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk -gt 0
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Persistent L2ARC with an encrypted ZFS file system succeeds."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh
new file mode 100755
index 000000000..7824dfe8f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh
@@ -0,0 +1,87 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0
+#
+# STRATEGY:
+# 1. Set L2ARC_REBUILD_ENABLED = 0
+# 2. Create pool with a cache device.
+# 3. Create a random file in that pool and random read for 30 sec.
+# 4. Export pool.
+# 5. Import pool.
+# 6. Check in zpool iostat if the cache device has space allocated.
+# 7. Check that the l2_rebuild_success arcstat did not increase
+# across the import.
+#
+
+verify_runnable "global"
+
+log_assert "Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_REBUILD_ENABLED $rebuild_enabled
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+# disable L2ARC rebuild
+typeset rebuild_enabled=$(get_tunable L2ARC_REBUILD_ENABLED)
+log_must set_tunable32 L2ARC_REBUILD_ENABLED 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( 2 * $fill_mb ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool export $TESTPOOL
+
+typeset l2_success_start=$(get_arcstat l2_rebuild_success)
+
+log_must zpool import -d $VDIR $TESTPOOL
+log_mustnot test "$(zpool iostat -Hpv $TESTPOOL $VDEV_CACHE | awk '{print $2}')" -gt 80000000
+
+typeset l2_success_end=$(get_arcstat l2_rebuild_success)
+
+log_mustnot test $l2_success_end -gt $l2_success_start
+
+log_must zpool destroy -f $TESTPOOL
+log_must set_tunable32 L2ARC_REBUILD_ENABLED $rebuild_enabled
+
+log_pass "Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh
new file mode 100755
index 000000000..6620131d1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh
@@ -0,0 +1,101 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Persistent L2ARC restores all written log blocks
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create a random file in that pool, smaller than the cache device
+# and random read for 30 sec.
+# 3. Export pool.
+# 4. Read amount of log blocks written.
+# 5. Import pool.
+# 6. Read amount of log blocks built.
+# 7. Compare the two amounts
+# 8. Read the file written in (2) and check if l2_hits in
+# /proc/spl/kstat/zfs/arcstats increased.
+# 9. Check if the labels of the L2ARC device are intact.
+#
+
+verify_runnable "global"
+
+log_assert "Persistent L2ARC restores all written log blocks."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( 2 * $fill_mb ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+typeset log_blk_start=$(get_arcstat l2_log_blk_writes)
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool export $TESTPOOL
+
+sleep 2
+
+typeset log_blk_end=$(get_arcstat l2_log_blk_writes)
+
+typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool import -d $VDIR $TESTPOOL
+
+typeset l2_hits_start=$(get_arcstat l2_hits)
+
+export RUNTIME=10
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+typeset l2_hits_end=$(get_arcstat l2_hits)
+
+typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \
+ $(( $log_blk_end - $log_blk_start ))
+
+log_must test $l2_hits_end -gt $l2_hits_start
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Persistent L2ARC restores all written log blocks."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh
new file mode 100755
index 000000000..b2cad9d1f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh
@@ -0,0 +1,108 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+
+#
+# DESCRIPTION:
+# Persistent L2ARC restores all written log blocks with encryption
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create an encrypted ZFS file system.
+# 3. Create a random file in the encrypted file system,
+# smaller than the cache device, and random read for 30 sec.
+# 4. Export pool.
+# 5. Read amount of log blocks written.
+# 6. Import pool.
+# 7. Mount the encrypted ZFS file system.
+# 8. Read amount of log blocks built.
+# 9. Compare the two amounts
+# 10. Read the file written in (3) and check if l2_hits in
+# /proc/spl/kstat/zfs/arcstats increased.
+# 11. Check if the labels of the L2ARC device are intact.
+#
+
+verify_runnable "global"
+
+log_assert "Persistent L2ARC restores all written log blocks with encryption."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( 2 * $fill_mb ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+typeset log_blk_start=$(get_arcstat l2_log_blk_writes)
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
+ "-o keyformat=passphrase $TESTPOOL/$TESTFS1"
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool export $TESTPOOL
+
+sleep 2
+
+typeset log_blk_end=$(get_arcstat l2_log_blk_writes)
+
+typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool import -d $VDIR $TESTPOOL
+log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1"
+
+typeset l2_hits_start=$(get_arcstat l2_hits)
+
+export RUNTIME=10
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+typeset l2_hits_end=$(get_arcstat l2_hits)
+
+typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \
+ $(( $log_blk_end - $log_blk_start ))
+
+log_must test $l2_hits_end -gt $l2_hits_start
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Persistent L2ARC restores all written log blocks with encryption."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh
new file mode 100755
index 000000000..55e9f9585
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh
@@ -0,0 +1,98 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not
+# present.
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create a random file in that pool and random read for 30 sec.
+# 3. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 4. Offline the L2ARC device and export pool.
+# 5. Import pool and online the L2ARC device.
+# 6. Read the amount of log blocks rebuilt in arcstats and compare to
+# (3).
+# 7. Check if the labels of the L2ARC device are intact.
+#
+
+verify_runnable "global"
+
+log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+ log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \
+ $rebuild_blocks_min_l2size
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( floor($fill_mb / 2) ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool offline $TESTPOOL $VDEV_CACHE
+log_must zpool export $TESTPOOL
+
+sleep 5
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+log_must zpool import -d $VDIR $TESTPOOL
+log_must zpool online $TESTPOOL $VDEV_CACHE
+
+sleep 5
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk -gt 0
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh
new file mode 100755
index 000000000..e3c983be8
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh
@@ -0,0 +1,95 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present.
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create a random file in that pool and random read for 30 sec.
+# 3. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 4. Offline the L2ARC device.
+# 5. Online the L2ARC device.
+# 6. Read the amount of log blocks rebuilt in arcstats and compare to
+# (3).
+# 7. Check if the labels of the L2ARC device are intact.
+#
+
+verify_runnable "global"
+
+log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+ log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \
+ $rebuild_blocks_min_l2size
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0
+
+typeset fill_mb=800
+typeset cache_sz=$(( floor($fill_mb / 2) ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool offline $TESTPOOL $VDEV_CACHE
+
+sleep 5
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+log_must zpool online $TESTPOOL $VDEV_CACHE
+
+sleep 5
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk -gt 0
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh
new file mode 100755
index 000000000..a64bd94d3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh
@@ -0,0 +1,143 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+#
+# DESCRIPTION:
+# Off/onlining an L2ARC device restores all written blocks, vdev present.
+#
+# STRATEGY:
+# 1. Create pool with a cache device.
+# 2. Create a random file in that pool and random read for 30 sec.
+# 3. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 4. Offline the L2ARC device.
+# 5. Online the L2ARC device.
+# 6. Read the amount of log blocks rebuilt in arcstats and compare to
+# (3).
+# 7. Create another random file in that pool and random read for 30 sec.
+# 8. Read the amount of log blocks written from the header of the
+# L2ARC device.
+# 9. Offline the L2ARC device.
+# 10. Online the L2ARC device.
+# 11. Read the amount of log blocks rebuilt in arcstats and compare to
+# (8).
+# 12. Check if the amount of log blocks on the cache device has
+# increased.
+# 13. Export the pool.
+# 14. Read the amount of log blocks on the cache device.
+# 15. Import the pool.
+# 16. Read the amount of log blocks rebuilt in arcstats and compare to
+# (14).
+# 17. Check if the labels of the L2ARC device are intact.
+#
+
+verify_runnable "global"
+
+log_assert "Off/onlining an L2ARC device restores all written blocks, vdev present."
+
+function cleanup
+{
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+}
+log_onexit cleanup
+
+# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+typeset fill_mb=400
+typeset cache_sz=$(( 3 * $fill_mb ))
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
+
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool offline $TESTPOOL $VDEV_CACHE
+
+sleep 2
+
+typeset l2_dh_log_blk1=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool online $TESTPOOL $VDEV_CACHE
+
+sleep 5
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk1 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk1 -gt 0
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+log_must zpool offline $TESTPOOL $VDEV_CACHE
+
+sleep 2
+
+typeset l2_dh_log_blk2=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool online $TESTPOOL $VDEV_CACHE
+
+sleep 5
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk2 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+
+log_must test $l2_dh_log_blk2 -gt $l2_dh_log_blk1
+
+log_must zpool export $TESTPOOL
+
+typeset l2_dh_log_blk3=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
+ awk '{print $2}')
+
+typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
+
+log_must zpool import -d $VDIR $TESTPOOL
+
+sleep 5
+
+typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
+
+log_must test $l2_dh_log_blk3 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
+log_must test $l2_dh_log_blk3 -gt 0
+
+log_must zdb -lq $VDEV_CACHE
+
+log_must zpool destroy -f $TESTPOOL
+
+log_pass "Off/onlining an L2ARC device restores all written blocks, vdev present."
diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh
new file mode 100755
index 000000000..ef95c84cd
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh
@@ -0,0 +1,29 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg
+
+verify_runnable "global"
+
+log_must rm -rf $VDIR
+log_must mkdir -p $VDIR
+log_must mkfile $SIZE $VDEV
+
+log_pass