diff options
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/arc.h | 27 | ||||
-rw-r--r-- | include/sys/arc_impl.h | 95 | ||||
-rw-r--r-- | include/sys/trace_arc.h | 54 |
3 files changed, 121 insertions, 55 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h index 25e2f0352..903f0b413 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,10 +81,29 @@ typedef enum arc_flags ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ - ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. + */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1f8351a67..556cc2583 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,8 +74,6 @@ typedef struct arc_state { arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,27 +94,45 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -133,9 +149,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +163,51 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 8b885eff7..b9df228ea 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; - __entry->hdr_cksum0 = ab->b_cksum0; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_datacnt; - __entry->hdr_type = ab->b_type; + __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; __entry->hdr_size = ab->b_size; __entry->hdr_spa = ab->b_spa; - __entry->hdr_state_type = ab->b_state->arcs_state; - __entry->hdr_access = ab->b_arc_access; - __entry->hdr_mru_hits = ab->b_mru_hits; - __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; - __entry->hdr_mfu_hits = ab->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; - __entry->hdr_l2_hits = ab->b_l2_hits; - __entry->hdr_refcount = ab->b_refcnt.rc_count; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x datacnt %u type %u size %llu spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, @@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; - __entry->hdr_cksum0 = hdr->b_cksum0; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_datacnt; - __entry->hdr_type = hdr->b_type; + __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; __entry->hdr_size = hdr->b_size; __entry->hdr_spa = hdr->b_spa; - __entry->hdr_state_type = hdr->b_state->arcs_state; - __entry->hdr_access = hdr->b_arc_access; - __entry->hdr_mru_hits = hdr->b_mru_hits; - __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; - __entry->hdr_mfu_hits = hdr->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l2_hits; - __entry->hdr_refcount = hdr->b_refcnt.rc_count; + __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; @@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " - "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x datacnt %u size %llu spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, |