From e0b0ca983d6897bcddf05af2c0e5d01ff66f90db Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 2 Oct 2013 17:11:19 -0700 Subject: Add visibility in to cached dbufs Currently there is no mechanism to inspect which dbufs are being cached by the system. There are some coarse counters in arcstats by they only give a rough idea of what's being cached. This patch aims to improve the current situation by adding a new dbufs kstat. When read this new kstat will walk all cached dbufs linked in to the dbuf_hash. For each dbuf it will dump detailed information about the buffer. It will also dump additional information about the referenced arc buffer and its related dnode. This provides a more complete view in to exactly what is being cached. With this generic infrastructure in place utilities can be written to post-process the data to understand exactly how the caching is working. For example, the data could be processed to show a list of all cached dnodes and how much space they're consuming. Or a similar list could be generated based on dnode type. Many other ways to interpret the data exist based on what kinds of questions you're trying to answer. Signed-off-by: Brian Behlendorf Signed-off-by: Prakash Surya --- module/zfs/Makefile.in | 1 + module/zfs/arc.c | 87 +++++++++++++++++- module/zfs/dbuf.c | 4 + module/zfs/dbuf_stats.c | 230 ++++++++++++++++++++++++++++++++++++++++++++++++ module/zfs/dmu.c | 18 ++-- 5 files changed, 332 insertions(+), 8 deletions(-) create mode 100644 module/zfs/dbuf_stats.c (limited to 'module/zfs') diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6f0f6ef05..5552436ad 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 366f5bf9b..9098988fd 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -232,6 +232,7 @@ typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; + arc_state_type_t arcs_state; } arc_state_t; /* The 6 states: */ @@ -534,6 +535,11 @@ struct arc_buf_hdr { /* updated atomically */ clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; /* self protecting */ refcount_t b_refcnt; @@ -709,7 +715,8 @@ struct l2arc_buf_hdr { /* compression applied to buffer data */ enum zio_compress b_compress; /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; + uint32_t b_asize; + uint32_t b_hits; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; }; @@ -1137,6 +1144,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) return (cnt); } +/* + * Returns detailed information about a specific arc buffer. When the + * state_index argument is set the function will calculate the arc header + * list position for its arc state. Since this requires a linear traversal + * callers are strongly encourage not to do this. However, it can be helpful + * for targeted analysis so the functionality is provided. + */ +void +arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) +{ + arc_buf_hdr_t *hdr = ab->b_hdr; + arc_state_t *state = hdr->b_state; + + memset(abi, 0, sizeof(arc_buf_info_t)); + abi->abi_flags = hdr->b_flags; + abi->abi_datacnt = hdr->b_datacnt; + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = hdr->b_type; + abi->abi_state_index = -1; + abi->abi_size = hdr->b_size; + abi->abi_access = hdr->b_arc_access; + abi->abi_mru_hits = hdr->b_mru_hits; + abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&hdr->b_refcnt); + + if (hdr->b_l2hdr) { + abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; + abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; + abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; + abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; + } + + if (state && state_index && list_link_active(&hdr->b_arc_node)) { + list_t *list = &state->arcs_list[hdr->b_type]; + arc_buf_hdr_t *h; + + mutex_enter(&state->arcs_mtx); + for (h = list_head(list); h != NULL; h = list_next(list, h)) { + abi->abi_state_index++; + if (h == hdr) + break; + } + mutex_exit(&state->arcs_mtx); + } +} + /* * Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. @@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } else { buf->b_flags &= ~ARC_PREFETCH; + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } buf->b_arc_access = now; @@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (buf->b_state == arc_mru_ghost) { arc_state_t *new_state; @@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (buf->b_state == arc_mfu) { /* @@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); } + atomic_inc_32(&buf->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { @@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (buf->b_state == arc_l2c_only) { /* @@ -3134,6 +3199,7 @@ top: DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); + atomic_inc_32(&hdr->b_l2hdr->b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_PUSHPAGE); @@ -3469,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; + nhdr->b_mru_hits = 0; + nhdr->b_mru_ghost_hits = 0; + nhdr->b_mfu_hits = 0; + nhdr->b_mfu_ghost_hits = 0; + nhdr->b_l2_hits = 0; nhdr->b_flags = flags & ARC_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; @@ -3485,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; if (hash_lock) mutex_exit(hash_lock); @@ -3902,6 +3978,13 @@ arc_init(void) list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + buf_init(); arc_thread_exit = 0; @@ -4785,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_compress = ZIO_COMPRESS_OFF; l2hdr->b_asize = ab->b_size; l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_hits = 0; buf_sz = ab->b_size; ab->b_l2hdr = l2hdr; @@ -5317,6 +5401,7 @@ l2arc_stop(void) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d655d6621..44e9419b7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -317,6 +317,8 @@ retry: for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + dbuf_stats_init(h); } void @@ -325,6 +327,8 @@ dbuf_fini(void) dbuf_hash_table_t *h = &dbuf_hash_table; int i; + dbuf_stats_destroy(); + for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c new file mode 100644 index 000000000..ef760eaba --- /dev/null +++ b/module/zfs/dbuf_stats.c @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +/* + * Calculate the index of the arc header for the state, disabled by default. + */ +int zfs_dbuf_state_index = 0; + +/* + * ========================================================================== + * Dbuf Hash Read Routines + * ========================================================================== + */ +typedef struct dbuf_stats_t { + kmutex_t lock; + kstat_t *kstat; + dbuf_hash_table_t *hash; + int idx; +} dbuf_stats_t; + +static dbuf_stats_t dbuf_stats_hash_table; + +static int +dbuf_stats_hash_table_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, + "%-88s | %-124s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " + "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " + "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", + "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", + "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", + "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", + "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", + "dtype", "btype", "data_bs", "meta_bs", "bsize", + "lvls", "dholds", "blocks", "dsize"); + buf[size] = '\0'; + + return (0); +} + +int +__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) +{ + arc_buf_info_t abi = { 0 }; + dmu_object_info_t doi = { 0 }; + dnode_t *dn = DB_DNODE(db); + + if (db->db_buf) + arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); + + if (dn) + __dmu_object_info_from_dnode(dn, &doi); + + size = snprintf(buf, size - 1, + "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " + "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " + "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", + /* dmu_buf_impl_t */ + spa_name(dn->dn_objset->os_spa), + (u_longlong_t)dmu_objset_id(db->db_objset), + (longlong_t)db->db.db_object, + (longlong_t)db->db_level, + (longlong_t)db->db_blkid, + (u_longlong_t)db->db.db_offset, + (u_longlong_t)db->db.db_size, + !!dbuf_is_metadata(db), + db->db_state, + (ulong_t)refcount_count(&db->db_holds), + /* arc_buf_info_t */ + abi.abi_state_type, + abi.abi_state_contents, + (longlong_t)abi.abi_state_index, + abi.abi_flags, + (ulong_t)abi.abi_datacnt, + (u_longlong_t)abi.abi_size, + (u_longlong_t)abi.abi_access, + (ulong_t)abi.abi_mru_hits, + (ulong_t)abi.abi_mru_ghost_hits, + (ulong_t)abi.abi_mfu_hits, + (ulong_t)abi.abi_mfu_ghost_hits, + (ulong_t)abi.abi_l2arc_hits, + (u_longlong_t)abi.abi_l2arc_dattr, + (u_longlong_t)abi.abi_l2arc_asize, + abi.abi_l2arc_compress, + (ulong_t)abi.abi_holds, + /* dmu_object_info_t */ + doi.doi_type, + doi.doi_bonus_type, + (ulong_t)doi.doi_data_block_size, + (ulong_t)doi.doi_metadata_block_size, + (u_longlong_t)doi.doi_bonus_size, + (ulong_t)doi.doi_indirection, + (ulong_t)refcount_count(&dn->dn_holds), + (u_longlong_t)doi.doi_fill_count, + (u_longlong_t)doi.doi_max_offset); + buf[size] = '\0'; + + return (size); +} + +static int +dbuf_stats_hash_table_data(char *buf, size_t size, void *data) +{ + dbuf_stats_t *dsh = (dbuf_stats_t *)data; + dbuf_hash_table_t *h = dsh->hash; + dmu_buf_impl_t *db; + int length, error = 0; + + ASSERT3S(dsh->idx, >=, 0); + ASSERT3S(dsh->idx, <=, h->hash_table_mask); + memset(buf, 0, size); + + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { + /* + * Returning ENOMEM will cause the data and header functions + * to be called with a larger scratch buffers. + */ + if (size < 512) { + error = ENOMEM; + break; + } + + mutex_enter(&db->db_mtx); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + length = __dbuf_stats_hash_table_data(buf, size, db); + buf += length; + size -= length; + + mutex_exit(&db->db_mtx); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + } + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + return (error); +} + +static void * +dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) +{ + dbuf_stats_t *dsh = ksp->ks_private; + + ASSERT(MUTEX_HELD(&dsh->lock)); + + if (n <= dsh->hash->hash_table_mask) { + dsh->idx = n; + return (dsh); + } + + return (NULL); +} + +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); + dsh->hash = hash; + + ksp = kstat_create("zfs", 0, "dbufs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + dsh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &dsh->lock; + ksp->ks_ndata = UINT32_MAX; + ksp->ks_private = dsh; + kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, + dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); + kstat_install(ksp); + } +} + +static void +dbuf_stats_hash_table_destroy(void) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + ksp = dsh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&dsh->lock); +} + +void +dbuf_stats_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_hash_table_init(hash); +} + +void +dbuf_stats_destroy(void) +{ + dbuf_stats_hash_table_destroy(); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_dbuf_state_index, int, 0644); +MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); +#endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cbf4790b1..4ec9cb46a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) } void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { - dnode_phys_t *dnp; + dnode_phys_t *dnp = dn->dn_phys; int i; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - dnp = dn->dn_phys; - doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; @@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); -- cgit v1.2.3