about summary refs log tree commit diff stats
path: root/include/sys
diff options
context:
space:
mode:
author George Amanakis <[email protected]> 2020-04-10 13:33:35 -0400
committer GitHub <[email protected]> 2020-04-10 10:33:35 -0700
commit 77f6826b83b7e27f0996f6d192202c36f65e41fd (patch)
tree b9946c99348bf6742cc41739aeff1a2b952d9d2f /include/sys
parent 36a6e2335c45212f2609269bcee3004908ac6bcb (diff)
Persistent L2ARC
This commit makes the L2ARC persistent across reboots. We implement a light-weight persistent L2ARC metadata structure that allows L2ARC contents to be recovered after a reboot. This significantly eases the impact a reboot has on read performance on systems with large caches. Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Co-authored-by: Saso Kiselkov <[email protected]> Co-authored-by: Jorgen Lundman <[email protected]> Co-authored-by: George Amanakis <[email protected]> Ported-by: Yuxuan Shui <[email protected]> Signed-off-by: George Amanakis <[email protected]> Closes #925 Closes #1823 Closes #2672 Closes #3744 Closes #9582
Diffstat (limited to 'include/sys')
-rw-r--r-- include/sys/arc.h 4
-rw-r--r-- include/sys/arc_impl.h 305
-rw-r--r-- include/sys/fs/zfs.h 5
-rw-r--r-- include/sys/spa.h 1
4 files changed, 297 insertions, 18 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 75c483918..f500e1d45 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -310,10 +310,14 @@ void arc_fini(void);
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
+ uint64_t check);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
+void l2arc_spa_rebuild_start(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 2468a4aac..928b72325 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -20,9 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, Delphix. All rights reserved.
+ * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
*/
#ifndef _SYS_ARC_IMPL_H
@@ -176,6 +177,218 @@ typedef struct l1arc_buf_hdr {
abd_t *b_pabd;
} l1arc_buf_hdr_t;
+typedef enum l2arc_dev_hdr_flags_t {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+} l2arc_dev_hdr_flags_t;
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks).
+ */
+typedef struct l2arc_log_blkptr {
+ /*
+ * Offset of log block within the device, in bytes
+ */
+ uint64_t lbp_daddr;
+ /*
+ * Aligned payload size (in bytes) of the log block
+ */
+ uint64_t lbp_payload_asize;
+ /*
+ * Offset in bytes of the first buffer in the payload
+ */
+ uint64_t lbp_payload_start;
+ /*
+ * lbp_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for lbp_cksum)
+ */
+ uint64_t lbp_prop;
+ zio_cksum_t lbp_cksum; /* checksum of log */
+} l2arc_log_blkptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
+ uint64_t dh_version; /* Persistent L2ARC version */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t dh_spa_guid;
+ uint64_t dh_vdev_guid;
+ uint64_t dh_log_blk_ent; /* entries per log blk */
+ uint64_t dh_evict; /* evicted offset in bytes */
+ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
+ /*
+ * Used in zdb.c for determining if a log block is valid, in the same
+ * way that l2arc_rebuild() does.
+ */
+ uint64_t dh_start;
+ uint64_t dh_end;
+
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blkptr_t dh_start_lbps[2];
+ const uint64_t dh_pad[34]; /* pad to 512 bytes */
+ zio_eck_t dh_tail;
+} l2arc_dev_hdr_phys_t;
+CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t le_dva; /* dva of buffer */
+ uint64_t le_birth; /* birth txg of buffer */
+ /*
+ * le_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm
+ * * object type (used to restore arc_buf_contents_t)
+ * * protected status (used for encryption)
+ * * prefetch status (used in l2arc_read_done())
+ */
+ uint64_t le_prop;
+ uint64_t le_daddr; /* buf location on l2dev */
+ /*
+ * We pad the size of each entry to a power of 2 so that the size of
+ * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
+ * because of the L2BLK_SET_*SIZE macros.
+ */
+ const uint64_t le_pad[3]; /* pad to 64 bytes */
+} l2arc_log_ent_phys_t;
+
+#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
+
+/*
+ * A log block of up to 1022 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list. Byte order of magic determines
+ * whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_log_blk_phys {
+ uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
+ /*
+ * There are 2 chains (headed by dh_start_lbps[2]), and this field
+ * points back to the previous block in this chain. We alternate
+ * which chain we append to, so they are time-wise and offset-wise
+ * interleaved, but that is an optimization rather than for
+ * correctness.
+ */
+ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
+ /*
+ * Pad header section to 128 bytes
+ */
+ uint64_t lb_pad[7];
+ /* Payload */
+ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
+} l2arc_log_blk_phys_t; /* 64K total */
+
+/*
+ * The size of l2arc_log_blk_phys_t has to be aligned to 1 << SPA_MINBLOCKSHIFT
+ * because of the L2BLK_SET_*SIZE macros, and has to lie between
+ * SPA_MINBLOCKSIZE and SPA_MAXBLOCKSIZE (inclusive).
+ */
+CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
+ 1ULL << SPA_MINBLOCKSHIFT));
+CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
+CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
+
+/*
+ * These structures hold in-flight abd buffers for log blocks as they're being
+ * written to the L2ARC device.
+ */
+typedef struct l2arc_lb_abd_buf {
+ abd_t *abd;
+ list_node_t node;
+} l2arc_lb_abd_buf_t;
+
+/*
+ * These structures hold pointers to log blocks present on the L2ARC device.
+ */
+typedef struct l2arc_lb_ptr_buf {
+ l2arc_log_blkptr_t *lb_ptr;
+ list_node_t node;
+} l2arc_lb_ptr_buf_t;
+
+/* Macros for getting and setting fields in le_prop and lbp_prop */
+#define L2BLK_GET_LSIZE(field) \
+ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_LSIZE(field, x) \
+ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_PSIZE(field) \
+ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_PSIZE(field, x) \
+ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_COMPRESS(field) \
+ BF64_GET((field), 32, SPA_COMPRESSBITS)
+#define L2BLK_SET_COMPRESS(field, x) \
+ BF64_SET((field), 32, SPA_COMPRESSBITS, x)
+#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
+#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
+#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
+#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
+#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
+#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
+#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
+#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
+
+/*
+ * Swap two pointer lvalues. Each argument is expanded twice, so neither
+ * may have side effects. The temporary uses a macro-specific name so that
+ * a caller's variable called "tmp" cannot be captured by the expansion.
+ */
+#define PTR_SWAP(x, y) \
+ do { \
+ void *ptr_swap_tmp_ = (x);\
+ (x) = (y); \
+ (y) = ptr_swap_tmp_; \
+ _NOTE(CONSTCOND)\
+ } while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
+#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ zfs_refcount_t l2ad_alloc; /* allocated bytes */
+ /*
+ * Persistence-related stuff
+ */
+ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
+ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* Number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /*
+ * Offset (in bytes) of the first buffer in current log block's
+ * payload.
+ */
+ uint64_t l2ad_log_blk_payload_start;
+ /* Flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+ boolean_t l2ad_rebuild_cancel;
+ boolean_t l2ad_rebuild_began;
+ uint64_t l2ad_log_entries; /* entries per log blk */
+ uint64_t l2ad_evict; /* evicted offset in bytes */
+ /* List of pointers to log blocks present in the L2ARC device */
+ list_t l2ad_lbptr_list;
+} l2arc_dev_t;
+
/*
* Encrypted blocks will need to be stored encrypted on the L2ARC
* disk as they appear in the main pool. In order for this to work we
@@ -206,32 +419,19 @@ typedef struct arc_buf_hdr_crypt {
uint8_t b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;
-typedef struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-} l2arc_dev_t;
-
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
uint32_t b_hits;
-
list_node_t b_l2node;
} l2arc_buf_hdr_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* in-flight list of log blocks */
+ list_t l2wcb_abd_list;
} l2arc_write_callback_t;
struct arc_buf_hdr {
@@ -532,6 +732,71 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_psize;
/* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_l2_hdr_size;
+ /*
+ * Number of L2ARC log blocks written. These are used for restoring the
+ * L2ARC. Updated during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_writes;
+ /*
+ * Moving average of the physical size of the L2ARC log blocks, in
+ * bytes. Updated during L2ARC rebuild and during writing of L2ARC
+ * log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_avg_size;
+ /*
+ * Moving average of the ratio of the physical size of L2ARC restored
+ * data to the physical size of their metadata in ARC (both in bytes).
+ * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ /*
+ * Number of times the L2ARC rebuild was successful for an L2ARC device.
+ */
+ kstat_named_t arcstat_l2_rebuild_success;
+ /*
+ * Number of times the L2ARC rebuild failed because the device header
+ * was in an unsupported format or corrupted.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors
+ * while reading a log block.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors when
+ * reading the device header.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
+ /*
+ * Number of L2ARC log blocks which failed to be restored due to
+ * checksum errors.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
+ /*
+ * Number of times the L2ARC rebuild was aborted due to low system
+ * memory.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ /* Logical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_size;
+ /*
+ * Number of L2ARC log entries (buffers) that were successfully
+ * restored in ARC.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ /*
+ * Number of L2ARC log entries (buffers) already cached in ARC. These
+ * were not restored again.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ /* Physical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_psize;
+ /*
+ * Number of L2ARC log blocks that were restored successfully. Each
+ * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
+ */
+ kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
@@ -617,6 +882,10 @@ extern void arc_tuning_update(boolean_t);
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
+/* used in zdb.c */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 477356aa7..f5aced0da 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -573,6 +573,11 @@ typedef enum zfs_key_location {
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5
+/* Persistent L2ARC version */
+#define L2ARC_PERSISTENT_VERSION_1 1ULL
+#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
+#define L2ARC_PERSISTENT_VERSION_STRING "1"
+
/* Rewind policy information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index e1fab3c69..6e844f5ee 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -787,6 +787,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
+#define SPA_ASYNC_L2CACHE_REBUILD 0x800
/*
* Controls the behavior of spa_vdev_remove().