summaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorPrakash Surya <[email protected]>2013-09-06 16:09:05 -0700
committerBrian Behlendorf <[email protected]>2013-10-25 13:57:25 -0700
commit1421c89142376bfd41e4de22ed7c7846b9e41f95 (patch)
treebc5b9d92faa233d5fd91a1a44cf3c2a173aba106 /module/zfs
parent76463d4026e0fa4b3d7b96acd58cb5fb79c49af7 (diff)
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls occurring on a system, in real time. To do this, a list was added to the in memory SPA data structure for a pool, with each element on the list corresponding to a call to arc_read. These entries are then exported through the kstat interface, which can then be interpreted in userspace. For each arc_read call, the following information is exported: * A unique identifier (uint64_t) * The time the entry was added to the list (hrtime_t) (*not* wall clock time; relative to the other entries on the list) * The objset ID (uint64_t) * The object number (uint64_t) * The indirection level (uint64_t) * The block ID (uint64_t) * The name of the function originating the arc_read call (char[24]) * The arc_flags from the arc_read call (uint32_t) * The PID of the reading thread (pid_t) * The command or name of thread originating read (char[16]) From this exported information one can see, in real time, exactly what is being read, what function is generating the read, and whether or not the read was found to be already cached. There is still some work to be done, but this should serve as a good starting point. Specifically, dbuf_read's are not accounted for in the currently exported information. Thus, a follow up patch should probably be added to export these calls that never call into arc_read (they only hit the dbuf hash table). In addition, it might be nice to create a utility similar to "arcstat.py" to digest the exported information and display it in a more readable format. Or perhaps, log the information and allow for it to be "replayed" at a later time. Signed-off-by: Prakash Surya <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/Makefile.in1
-rw-r--r--module/zfs/arc.c20
-rw-r--r--module/zfs/dmu_traverse.c7
-rw-r--r--module/zfs/spa_misc.c4
-rw-r--r--module/zfs/spa_stats.c248
5 files changed, 269 insertions, 11 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index e71228454..6f0f6ef05 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -44,6 +44,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 1298c5b91..366f5bf9b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -2943,6 +2943,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
kmutex_t *hash_lock;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ int rc = 0;
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -2976,10 +2977,10 @@ top:
hdr->b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -3174,12 +3175,12 @@ top:
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
- return (0);
+ goto out;
}
ASSERT(*arc_flags & ARC_WAIT);
if (zio_wait(rzio) == 0)
- return (0);
+ goto out;
/* l2arc read error; goto zio_read() */
} else {
@@ -3203,13 +3204,18 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
+ if (*arc_flags & ARC_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
- return (0);
+
+out:
+ spa_read_history_add(spa, zb, *arc_flags);
+ return (rc);
}
arc_prune_t *
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 32b3e50fc..1827ffc74 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
+ SET_BOOKMARK(czb, td->td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
uint32_t flags = ARC_WAIT;
@@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
err = arc_read(NULL, td->td_spa, rootbp,
arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
if (err != 0)
return (err);
@@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td, TQ_NOQUEUE))
pd->pd_exited = B_TRUE;
- SET_BOOKMARK(czb, td->td_objset,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(td, NULL, rootbp, czb);
mutex_enter(&pd->pd_mtx);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 9ba6e8963..a0b4797c9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -48,6 +48,7 @@
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/kstat.h>
#include "zfs_prop.h"
#include "zfeature_common.h"
@@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL;
*/
int zfs_deadman_enabled = 1;
-
/*
* ==========================================================================
* SPA config locking
@@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
+ spa_stats_init(spa);
avl_add(&spa_namespace_avl, spa);
@@ -580,6 +581,7 @@ spa_remove(spa_t *spa)
refcount_destroy(&spa->spa_refcount);
+ spa_stats_destroy(spa);
spa_config_lock_destroy(spa);
for (t = 0; t < TXG_SIZE; t++)
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
new file mode 100644
index 000000000..271a3c312
--- /dev/null
+++ b/module/zfs/spa_stats.c
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+
+/*
+ * Keeps stats on last N reads per spa_t, disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+ uint64_t uid; /* unique identifier */
+ hrtime_t start; /* time read completed */
+ uint64_t objset; /* read from this objset */
+ uint64_t object; /* read of this object number */
+ uint64_t level; /* block's indirection level */
+ uint64_t blkid; /* read of this block id */
+ char origin[24]; /* read originated from here */
+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
+ pid_t pid; /* PID of task doing read */
+ char comm[16]; /* process name of task doing read */
+ list_node_t srh_link;
+} spa_read_history_t;
+
+static int
+spa_read_history_headers(char *buf, size_t size)
+{
+ size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+ "level", "blkid", "aflags", "origin", "pid", "process");
+ buf[size] = '\0';
+
+ return (0);
+}
+
+static int
+spa_read_history_data(char *buf, size_t size, void *data)
+{
+ spa_read_history_t *srh = (spa_read_history_t *)data;
+
+ size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx "
+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+ (u_longlong_t)srh->uid, srh->start,
+ (longlong_t)srh->objset, (longlong_t)srh->object,
+ (longlong_t)srh->level, (longlong_t)srh->blkid,
+ srh->aflags, srh->origin, srh->pid, srh->comm);
+ buf[size] = '\0';
+
+ return (0);
+}
+
+/*
+ * Calculate the address for the next spa_stats_history_t entry. The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static void *
+spa_read_history_addr(kstat_t *ksp, loff_t n)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+ ASSERT(MUTEX_HELD(&ssh->lock));
+
+ if (n == 0)
+ ssh->private = list_tail(&ssh->list);
+ else if (ssh->private)
+ ssh->private = list_prev(&ssh->list, ssh->private);
+
+ return (ssh->private);
+}
+
+/*
+ * When the kstat is written discard all spa_read_history_t entires. The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static int
+spa_read_history_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+ if (rw == KSTAT_WRITE) {
+ spa_read_history_t *srh;
+
+ while ((srh = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(srh, sizeof(spa_read_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ }
+
+ ksp->ks_ndata = ssh->size;
+ ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t);
+
+ return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ char name[KSTAT_STRLEN];
+ kstat_t *ksp;
+
+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&ssh->list, sizeof (spa_read_history_t),
+ offsetof(spa_read_history_t, srh_link));
+
+ ssh->count = 0;
+ ssh->size = 0;
+ ssh->private = NULL;
+
+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+ name[KSTAT_STRLEN-1] = '\0';
+
+ ksp = kstat_create(name, 0, "reads", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ ssh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &ssh->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_read_history_update;
+ kstat_set_raw_ops(ksp, spa_read_history_headers,
+ spa_read_history_data, spa_read_history_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ spa_read_history_t *srh;
+ kstat_t *ksp;
+
+ ksp = ssh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_enter(&ssh->lock);
+ while ((srh = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(srh, sizeof(spa_read_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ list_destroy(&ssh->list);
+ mutex_exit(&ssh->lock);
+
+ mutex_destroy(&ssh->lock);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ spa_read_history_t *srh, *rm;
+
+ ASSERT3P(spa, !=, NULL);
+ ASSERT3P(zb, !=, NULL);
+
+ if (zfs_read_history == 0 && ssh->size == 0)
+ return;
+
+ if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
+ return;
+
+ srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE);
+ strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin));
+ strlcpy(srh->comm, getcomm(), sizeof(srh->comm));
+ srh->start = gethrtime();
+ srh->objset = zb->zb_objset;
+ srh->object = zb->zb_object;
+ srh->level = zb->zb_level;
+ srh->blkid = zb->zb_blkid;
+ srh->aflags = aflags;
+ srh->pid = getpid();
+
+ mutex_enter(&ssh->lock);
+
+ srh->uid = ssh->count++;
+ list_insert_head(&ssh->list, srh);
+ ssh->size++;
+
+ while (ssh->size > zfs_read_history) {
+ ssh->size--;
+ rm = list_remove_tail(&ssh->list);
+ kmem_free(rm, sizeof(spa_read_history_t));
+ }
+
+ mutex_exit(&ssh->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+ spa_read_history_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+ spa_read_history_destroy(spa);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_read_history, int, 0644);
+MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
+
+module_param(zfs_read_history_hits, int, 0644);
+MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
+#endif