aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys
diff options
context:
space:
mode:
Diffstat (limited to 'include/sys')
-rw-r--r--include/sys/arc.h27
-rw-r--r--include/sys/arc_impl.h95
-rw-r--r--include/sys/trace_arc.h54
3 files changed, 121 insertions, 55 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 25e2f0352..903f0b413 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -81,10 +81,29 @@ typedef enum arc_flags
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
- ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */
- ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */
+ ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 16,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 17,
+ ARC_FLAG_HAS_L2HDR = 1 << 18,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included so that MDB can
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
} arc_flags_t;
struct arc_buf {
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 1f8351a67..556cc2583 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -74,8 +74,6 @@ typedef struct arc_state {
arc_state_type_t arcs_state;
} arc_state_t;
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -96,27 +94,45 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- arc_flags_t b_flags;
uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
@@ -133,9 +149,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
- list_node_t b_l2node;
-};
+ arc_callback_t *b_acb;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
+} l1arc_buf_hdr_t;
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
@@ -146,15 +163,51 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* real alloc'd buffer size depending on b_compress applied */
+ uint32_t b_hits;
+ int32_t b_asize;
+
+ list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ /*
+ * Even though this checksum is only set/verified when a buffer is in
+ * the L1 cache, it needs to be in the set of common fields because it
+ * must be preserved from the time before a buffer is written out to
+ * L2ARC until after it is read back in.
+ */
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /* immutable */
+ int32_t b_size;
+ uint64_t b_spa;
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
+};
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h
index 8b885eff7..b9df228ea 100644
--- a/include/sys/trace_arc.h
+++ b/include/sys/trace_arc.h
@@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
- __field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
- __entry->hdr_cksum0 = ab->b_cksum0;
__entry->hdr_flags = ab->b_flags;
- __entry->hdr_datacnt = ab->b_datacnt;
- __entry->hdr_type = ab->b_type;
+ __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt;
__entry->hdr_size = ab->b_size;
__entry->hdr_spa = ab->b_spa;
- __entry->hdr_state_type = ab->b_state->arcs_state;
- __entry->hdr_access = ab->b_arc_access;
- __entry->hdr_mru_hits = ab->b_mru_hits;
- __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
- __entry->hdr_mfu_hits = ab->b_mfu_hits;
- __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
- __entry->hdr_l2_hits = ab->b_l2_hits;
- __entry->hdr_refcount = ab->b_refcnt.rc_count;
+ __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
+ __entry->hdr_access = ab->b_l1hdr.b_arc_access;
+ __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
+ __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
+ __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
+ __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
+ __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
+ __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
- TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
+ TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x datacnt %u type %u size %llu spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
- __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
+ __entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
@@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
- __field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
- __entry->hdr_cksum0 = hdr->b_cksum0;
__entry->hdr_flags = hdr->b_flags;
- __entry->hdr_datacnt = hdr->b_datacnt;
- __entry->hdr_type = hdr->b_type;
+ __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt;
__entry->hdr_size = hdr->b_size;
__entry->hdr_spa = hdr->b_spa;
- __entry->hdr_state_type = hdr->b_state->arcs_state;
- __entry->hdr_access = hdr->b_arc_access;
- __entry->hdr_mru_hits = hdr->b_mru_hits;
- __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
- __entry->hdr_mfu_hits = hdr->b_mfu_hits;
- __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
- __entry->hdr_l2_hits = hdr->b_l2_hits;
- __entry->hdr_refcount = hdr->b_refcnt.rc_count;
+ __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
+ __entry->hdr_access = hdr->b_l1hdr.b_arc_access;
+ __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
+ __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+ __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
+ __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+ __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
+ __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
@@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_level = zb->zb_level;
__entry->zb_blkid = zb->zb_blkid;
),
- TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
- "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
+ TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
+ "flags 0x%x datacnt %u size %llu spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"lsize %llu } zb { objset %llu object %llu level %lli "
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
- __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
- __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
+ __entry->hdr_birth, __entry->hdr_flags,
+ __entry->hdr_datacnt, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,