diff options
author | Brian Behlendorf <[email protected]> | 2013-10-02 17:11:19 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2013-10-25 13:59:40 -0700 |
commit | e0b0ca983d6897bcddf05af2c0e5d01ff66f90db (patch) | |
tree | d6d3251e89a67f1a915f1a071313271e061af359 /module/zfs | |
parent | 2d37239a28b8b2ddc0e8312093f8d8810c6351fa (diff) |
Add visibility into cached dbufs
Currently there is no mechanism to inspect which dbufs are being
cached by the system. There are some coarse counters in arcstats,
but they only give a rough idea of what's being cached. This patch
aims to improve the current situation by adding a new dbufs kstat.
When read, this new kstat will walk all cached dbufs linked into
the dbuf_hash. For each dbuf it will dump detailed information
about the buffer. It will also dump additional information about
the referenced arc buffer and its related dnode. This provides a
more complete view into exactly what is being cached.
With this generic infrastructure in place utilities can be written
to post-process the data to understand exactly how the caching is
working. For example, the data could be processed to show a list
of all cached dnodes and how much space they're consuming. Or a
similar list could be generated based on dnode type. Many other
ways to interpret the data exist based on what kinds of questions
you're trying to answer.
Signed-off-by: Brian Behlendorf <[email protected]>
Signed-off-by: Prakash Surya <[email protected]>
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/Makefile.in | 1 | ||||
-rw-r--r-- | module/zfs/arc.c | 87 | ||||
-rw-r--r-- | module/zfs/dbuf.c | 4 | ||||
-rw-r--r-- | module/zfs/dbuf_stats.c | 230 | ||||
-rw-r--r-- | module/zfs/dmu.c | 18 |
5 files changed, 332 insertions, 8 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6f0f6ef05..5552436ad 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 366f5bf9b..9098988fd 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -232,6 +232,7 @@ typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; + arc_state_type_t arcs_state; } arc_state_t; /* The 6 states: */ @@ -534,6 +535,11 @@ struct arc_buf_hdr { /* updated atomically */ clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; /* self protecting */ refcount_t b_refcnt; @@ -709,7 +715,8 @@ struct l2arc_buf_hdr { /* compression applied to buffer data */ enum zio_compress b_compress; /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; + uint32_t b_asize; + uint32_t b_hits; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; }; @@ -1138,6 +1145,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) } /* + * Returns detailed information about a specific arc buffer. When the + * state_index argument is set the function will calculate the arc header + * list position for its arc state. Since this requires a linear traversal + * callers are strongly encourage not to do this. However, it can be helpful + * for targeted analysis so the functionality is provided. 
+ */ +void +arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) +{ + arc_buf_hdr_t *hdr = ab->b_hdr; + arc_state_t *state = hdr->b_state; + + memset(abi, 0, sizeof(arc_buf_info_t)); + abi->abi_flags = hdr->b_flags; + abi->abi_datacnt = hdr->b_datacnt; + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = hdr->b_type; + abi->abi_state_index = -1; + abi->abi_size = hdr->b_size; + abi->abi_access = hdr->b_arc_access; + abi->abi_mru_hits = hdr->b_mru_hits; + abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&hdr->b_refcnt); + + if (hdr->b_l2hdr) { + abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; + abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; + abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; + abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; + } + + if (state && state_index && list_link_active(&hdr->b_arc_node)) { + list_t *list = &state->arcs_list[hdr->b_type]; + arc_buf_hdr_t *h; + + mutex_enter(&state->arcs_mtx); + for (h = list_head(list); h != NULL; h = list_next(list, h)) { + abi->abi_state_index++; + if (h == hdr) + break; + } + mutex_exit(&state->arcs_mtx); + } +} + +/* * Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. 
*/ @@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } else { buf->b_flags &= ~ARC_PREFETCH; + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } buf->b_arc_access = now; @@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (buf->b_state == arc_mru_ghost) { arc_state_t *new_state; @@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (buf->b_state == arc_mfu) { /* @@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); } + atomic_inc_32(&buf->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { @@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (buf->b_state == arc_l2c_only) { /* @@ -3134,6 +3199,7 @@ top: DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); + atomic_inc_32(&hdr->b_l2hdr->b_hits); cb = 
kmem_zalloc(sizeof (l2arc_read_callback_t), KM_PUSHPAGE); @@ -3469,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; + nhdr->b_mru_hits = 0; + nhdr->b_mru_ghost_hits = 0; + nhdr->b_mfu_hits = 0; + nhdr->b_mfu_ghost_hits = 0; + nhdr->b_l2_hits = 0; nhdr->b_flags = flags & ARC_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; @@ -3485,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; if (hash_lock) mutex_exit(hash_lock); @@ -3902,6 +3978,13 @@ arc_init(void) list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + buf_init(); arc_thread_exit = 0; @@ -4785,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_compress = ZIO_COMPRESS_OFF; l2hdr->b_asize = ab->b_size; l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_hits = 0; buf_sz = ab->b_size; ab->b_l2hdr = l2hdr; @@ -5317,6 +5401,7 @@ l2arc_stop(void) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d655d6621..44e9419b7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -317,6 +317,8 @@ retry: for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + 
dbuf_stats_init(h); } void @@ -325,6 +327,8 @@ dbuf_fini(void) dbuf_hash_table_t *h = &dbuf_hash_table; int i; + dbuf_stats_destroy(); + for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c new file mode 100644 index 000000000..ef760eaba --- /dev/null +++ b/module/zfs/dbuf_stats.c @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> + +/* + * Calculate the index of the arc header for the state, disabled by default. 
+ */ +int zfs_dbuf_state_index = 0; + +/* + * ========================================================================== + * Dbuf Hash Read Routines + * ========================================================================== + */ +typedef struct dbuf_stats_t { + kmutex_t lock; + kstat_t *kstat; + dbuf_hash_table_t *hash; + int idx; +} dbuf_stats_t; + +static dbuf_stats_t dbuf_stats_hash_table; + +static int +dbuf_stats_hash_table_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, + "%-88s | %-124s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " + "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " + "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", + "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", + "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", + "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", + "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", + "dtype", "btype", "data_bs", "meta_bs", "bsize", + "lvls", "dholds", "blocks", "dsize"); + buf[size] = '\0'; + + return (0); +} + +int +__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) +{ + arc_buf_info_t abi = { 0 }; + dmu_object_info_t doi = { 0 }; + dnode_t *dn = DB_DNODE(db); + + if (db->db_buf) + arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); + + if (dn) + __dmu_object_info_from_dnode(dn, &doi); + + size = snprintf(buf, size - 1, + "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " + "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " + "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", + /* dmu_buf_impl_t */ + spa_name(dn->dn_objset->os_spa), + (u_longlong_t)dmu_objset_id(db->db_objset), + (longlong_t)db->db.db_object, + (longlong_t)db->db_level, + (longlong_t)db->db_blkid, + (u_longlong_t)db->db.db_offset, + (u_longlong_t)db->db.db_size, + 
!!dbuf_is_metadata(db), + db->db_state, + (ulong_t)refcount_count(&db->db_holds), + /* arc_buf_info_t */ + abi.abi_state_type, + abi.abi_state_contents, + (longlong_t)abi.abi_state_index, + abi.abi_flags, + (ulong_t)abi.abi_datacnt, + (u_longlong_t)abi.abi_size, + (u_longlong_t)abi.abi_access, + (ulong_t)abi.abi_mru_hits, + (ulong_t)abi.abi_mru_ghost_hits, + (ulong_t)abi.abi_mfu_hits, + (ulong_t)abi.abi_mfu_ghost_hits, + (ulong_t)abi.abi_l2arc_hits, + (u_longlong_t)abi.abi_l2arc_dattr, + (u_longlong_t)abi.abi_l2arc_asize, + abi.abi_l2arc_compress, + (ulong_t)abi.abi_holds, + /* dmu_object_info_t */ + doi.doi_type, + doi.doi_bonus_type, + (ulong_t)doi.doi_data_block_size, + (ulong_t)doi.doi_metadata_block_size, + (u_longlong_t)doi.doi_bonus_size, + (ulong_t)doi.doi_indirection, + (ulong_t)refcount_count(&dn->dn_holds), + (u_longlong_t)doi.doi_fill_count, + (u_longlong_t)doi.doi_max_offset); + buf[size] = '\0'; + + return (size); +} + +static int +dbuf_stats_hash_table_data(char *buf, size_t size, void *data) +{ + dbuf_stats_t *dsh = (dbuf_stats_t *)data; + dbuf_hash_table_t *h = dsh->hash; + dmu_buf_impl_t *db; + int length, error = 0; + + ASSERT3S(dsh->idx, >=, 0); + ASSERT3S(dsh->idx, <=, h->hash_table_mask); + memset(buf, 0, size); + + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { + /* + * Returning ENOMEM will cause the data and header functions + * to be called with a larger scratch buffers. 
+ */ + if (size < 512) { + error = ENOMEM; + break; + } + + mutex_enter(&db->db_mtx); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + length = __dbuf_stats_hash_table_data(buf, size, db); + buf += length; + size -= length; + + mutex_exit(&db->db_mtx); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + } + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + return (error); +} + +static void * +dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) +{ + dbuf_stats_t *dsh = ksp->ks_private; + + ASSERT(MUTEX_HELD(&dsh->lock)); + + if (n <= dsh->hash->hash_table_mask) { + dsh->idx = n; + return (dsh); + } + + return (NULL); +} + +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); + dsh->hash = hash; + + ksp = kstat_create("zfs", 0, "dbufs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + dsh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &dsh->lock; + ksp->ks_ndata = UINT32_MAX; + ksp->ks_private = dsh; + kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, + dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); + kstat_install(ksp); + } +} + +static void +dbuf_stats_hash_table_destroy(void) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + ksp = dsh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&dsh->lock); +} + +void +dbuf_stats_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_hash_table_init(hash); +} + +void +dbuf_stats_destroy(void) +{ + dbuf_stats_hash_table_destroy(); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_dbuf_state_index, int, 0644); +MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); +#endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cbf4790b1..4ec9cb46a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) } void -dmu_object_info_from_dnode(dnode_t 
*dn, dmu_object_info_t *doi) +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { - dnode_phys_t *dnp; + dnode_phys_t *dnp = dn->dn_phys; int i; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - dnp = dn->dn_phys; - doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; @@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); |