diff options
author | Chris Williamson <[email protected]> | 2014-12-29 19:12:23 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2015-06-11 10:27:25 -0700 |
commit | b9541d6b7d765883f8a5fe7c1bde74df5c256ff6 (patch) | |
tree | db1403d12322dc1f49ed76e6a5e3e08c549a700b /include/sys | |
parent | 2a4324141f4a0811ba29dfef123fe5dad2ca1b03 (diff) |
Illumos 5408 - managing ZFS cache devices requires lots of RAM
5408 managing ZFS cache devices requires lots of RAM
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Don Brady <[email protected]>
Reviewed by: Josef 'Jeff' Sipek <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
Porting notes:
Due to the restructuring of the ARC-related structures, this
patch conflicts with at least the following existing ZoL commits:
6e1d7276c94cbd7c2e19f9232f6ba4bafa62dbe0
Fix inaccurate arcstat_l2_hdr_size calculations
The ARC_SPACE_HDRS constant no longer exists; its role is now
filled by the roughly equivalent HDR_L2ONLY_SIZE.
e0b0ca983d6897bcddf05af2c0e5d01ff66f90db
Add visibility in to cached dbufs
The new layering of l{1,2}arc_buf_hdr_t within the arc_buf_hdr
struct requires additional structure member names to be used
when referencing the inner items. Also, the presence of the L1 or
L2 inner member is indicated by flags using the new
HDR_HAS_L{1,2}HDR macros.
Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/arc.h | 27 | ||||
-rw-r--r-- | include/sys/arc_impl.h | 95 | ||||
-rw-r--r-- | include/sys/trace_arc.h | 54 |
3 files changed, 121 insertions, 55 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h index 25e2f0352..903f0b413 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,10 +81,29 @@ typedef enum arc_flags ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ - ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1f8351a67..556cc2583 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,8 +74,6 @@ typedef struct arc_state { arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,27 +94,45 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. 
arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -133,9 +149,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +163,51 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * 
L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 8b885eff7..b9df228ea 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; - __entry->hdr_cksum0 = ab->b_cksum0; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_datacnt; - __entry->hdr_type = ab->b_type; + __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; __entry->hdr_size = ab->b_size; __entry->hdr_spa = ab->b_spa; - __entry->hdr_state_type = ab->b_state->arcs_state; - __entry->hdr_access = ab->b_arc_access; - __entry->hdr_mru_hits = ab->b_mru_hits; - __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; - __entry->hdr_mfu_hits = ab->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; - __entry->hdr_l2_hits = ab->b_l2_hits; - __entry->hdr_refcount = ab->b_refcnt.rc_count; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = 
ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x datacnt %u type %u size %llu spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, @@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; - __entry->hdr_cksum0 = hdr->b_cksum0; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_datacnt; - __entry->hdr_type = hdr->b_type; + __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; __entry->hdr_size = hdr->b_size; __entry->hdr_spa = hdr->b_spa; - __entry->hdr_state_type = hdr->b_state->arcs_state; - __entry->hdr_access = hdr->b_arc_access; - __entry->hdr_mru_hits = hdr->b_mru_hits; - __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; - __entry->hdr_mfu_hits = hdr->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l2_hits; - __entry->hdr_refcount = hdr->b_refcnt.rc_count; + __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = 
hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; @@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " - "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x datacnt %u size %llu spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, |