author    Brian Behlendorf <[email protected]>  2012-11-06 14:44:07 -0800
committer Brian Behlendorf <[email protected]>  2012-11-06 14:48:06 -0800
commit    bbf8c748051294b104678f101ba1162632099b07 (patch)
tree      4752e52379492cd45bdb5cd930496723dabed269
parent    e8fd45a0f975c6b8ae8cd644714fc21f14fac2bf (diff)
parent    9dcb97198338ba2d8764dd5604b278118612f74d (diff)
Merge remote branch 'eris/stats'
Bring in support for the new KSTAT_TYPE_TXG type. This allows for
additional visibility into the txg handling.

Signed-off-by: Brian Behlendorf <[email protected]>
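Once a pool is imported, the new kstat surfaces through the SPL's procfs
bridge. A minimal userspace sketch of reading it, assuming the usual
/proc/spl/kstat/zfs/txgs-<pool> path and a hypothetical pool named "tank":

	#include <stdio.h>

	int
	main(void)
	{
		/*
		 * Path assumed from the SPL kstat layout; "tank" is a
		 * hypothetical pool name.
		 */
		FILE *fp = fopen("/proc/spl/kstat/zfs/txgs-tank", "r");
		char line[256];

		if (fp == NULL) {
			perror("fopen");
			return (1);
		}
		while (fgets(line, sizeof (line), fp) != NULL)
			fputs(line, stdout);
		(void) fclose(fp);
		return (0);
	}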
-rw-r--r--  include/sys/dsl_pool.h         |  15
-rw-r--r--  include/sys/fm/fs/zfs.h        |   2
-rw-r--r--  lib/libspl/include/sys/kstat.h |  27
-rw-r--r--  module/zfs/dsl_pool.c          | 144
-rw-r--r--  module/zfs/txg.c               |  46
-rw-r--r--  module/zfs/zfs_fm.c            |   4
-rw-r--r--  module/zfs/zio.c               |  32
7 files changed, 259 insertions(+), 11 deletions(-)
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 99a131eec..40e96101d 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -65,6 +65,13 @@ typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
} zfs_all_blkstats_t;
+typedef struct txg_history {
+ kstat_txg_t th_kstat;
+ vdev_stat_t th_vs1;
+ vdev_stat_t th_vs2;
+ kmutex_t th_lock;
+ list_node_t th_link;
+} txg_history_t;
typedef struct dsl_pool {
/* Immutable */
@@ -76,6 +83,7 @@ typedef struct dsl_pool {
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
+ kstat_t *dp_txg_kstat;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
@@ -92,6 +100,9 @@ typedef struct dsl_pool {
kmutex_t dp_lock;
uint64_t dp_space_towrite[TXG_SIZE];
uint64_t dp_tempreserved[TXG_SIZE];
+ uint64_t dp_txg_history_size;
+ list_t dp_txg_history;
+
/* Has its own locking */
tx_state_t dp_tx;
@@ -144,6 +155,10 @@ extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg);
+txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_txg_history_put(txg_history_t *th);
+
#ifdef __cplusplus
}
#endif
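The three declarations above form a small get/put protocol:
dsl_pool_txg_history_get() returns the matching entry with th_lock held,
and dsl_pool_txg_history_put() drops that lock. A kernel-context sketch of
the intended usage, mirroring the calls txg.c makes below:

	txg_history_t *th;

	/* _get() returns with th->th_lock held (or NULL if not found). */
	th = dsl_pool_txg_history_get(dp, txg);
	th->th_kstat.state = TXG_STATE_QUIESCING;
	dsl_pool_txg_history_put(th);	/* drops th_lock */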
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 6e652df98..15803c034 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -83,6 +83,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
index fcd3ed98b..6bd2ec878 100644
--- a/lib/libspl/include/sys/kstat.h
+++ b/lib/libspl/include/sys/kstat.h
@@ -228,8 +228,10 @@ typedef struct kstat32 {
/* ks_ndata == 1 */
#define KSTAT_TYPE_TIMER 4 /* event timer */
/* ks_ndata >= 1 */
+#define KSTAT_TYPE_TXG 5 /* txg statistics */
+ /* ks_ndata >= 0 */
-#define KSTAT_NUM_TYPES 5
+#define KSTAT_NUM_TYPES 6
/*
* kstat class
@@ -698,6 +700,29 @@ typedef struct kstat_timer {
#define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data)
+/*
+ * TXG statistics - bytes read/written and iops performed
+ */
+typedef enum kstat_txg_state {
+ TXG_STATE_OPEN = 1,
+ TXG_STATE_QUIESCING = 2,
+ TXG_STATE_SYNCING = 3,
+ TXG_STATE_COMMITTED = 4,
+} kstat_txg_state_t;
+
+typedef struct kstat_txg {
+ u_longlong_t txg; /* txg id */
+ kstat_txg_state_t state; /* txg state */
+ hrtime_t birth; /* birth time stamp */
+ u_longlong_t nread; /* number of bytes read */
+ u_longlong_t nwritten; /* number of bytes written */
+ uint_t reads; /* number of read operations */
+ uint_t writes; /* number of write operations */
+ hrtime_t open_time; /* open time */
+ hrtime_t quiesce_time; /* quiesce time */
+ hrtime_t sync_time; /* sync time */
+} kstat_txg_t;
+
#if defined(_KERNEL)
#include <sys/t_lock.h>
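Every hrtime_t member above is a nanosecond interval derived from
gethrtime(). A self-contained userspace model (the values and formatting
are illustrative, not the kernel's actual kstat output) showing the
natural millisecond conversion:

	#include <stdio.h>

	typedef long long hrtime_t;	/* nanoseconds, as from gethrtime() */

	int
	main(void)
	{
		/* Illustrative values for one committed txg. */
		hrtime_t open_time = 5000000000LL;	/* 5s open */
		hrtime_t quiesce_time = 12000000LL;	/* 12ms quiescing */
		hrtime_t sync_time = 900000000LL;	/* 900ms syncing */

		printf("open %.0fms quiesce %.0fms sync %.0fms\n",
		    open_time / 1e6, quiesce_time / 1e6, sync_time / 1e6);
		return (0);
	}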
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 53419778a..7e0fba589 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -44,6 +44,7 @@
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
+int zfs_txg_history = 60; /* statistics for the last N txgs */
unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
unsigned long zfs_write_limit_max = 0; /* max data payload per txg */
@@ -54,6 +55,143 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
+static int
+dsl_pool_txg_history_update(kstat_t *ksp, int rw)
+{
+ dsl_pool_t *dp = ksp->ks_private;
+ txg_history_t *th;
+ int i = 0;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ if (ksp->ks_data)
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ mutex_enter(&dp->dp_lock);
+
+ ksp->ks_ndata = dp->dp_txg_history_size;
+ ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t);
+ if (ksp->ks_data_size > 0)
+ ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE);
+
+ /* Traversed oldest to youngest for the most readable kstat output */
+ for (th = list_tail(&dp->dp_txg_history); th != NULL;
+ th = list_prev(&dp->dp_txg_history, th)) {
+ mutex_enter(&th->th_lock);
+ ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size);
+ memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t));
+ i += sizeof(kstat_txg_t);
+ mutex_exit(&th->th_lock);
+ }
+
+ mutex_exit(&dp->dp_lock);
+
+ return (0);
+}
+
+static void
+dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg)
+{
+ char name[KSTAT_STRLEN];
+
+ list_create(&dp->dp_txg_history, sizeof (txg_history_t),
+ offsetof(txg_history_t, th_link));
+ dsl_pool_txg_history_add(dp, txg);
+
+ (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa));
+ dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc",
+ KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL);
+ if (dp->dp_txg_kstat) {
+ dp->dp_txg_kstat->ks_data = NULL;
+ dp->dp_txg_kstat->ks_private = dp;
+ dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update;
+ kstat_install(dp->dp_txg_kstat);
+ }
+}
+
+static void
+dsl_pool_txg_history_destroy(dsl_pool_t *dp)
+{
+ txg_history_t *th;
+
+ if (dp->dp_txg_kstat) {
+ if (dp->dp_txg_kstat->ks_data)
+ kmem_free(dp->dp_txg_kstat->ks_data,
+ dp->dp_txg_kstat->ks_data_size);
+
+ kstat_delete(dp->dp_txg_kstat);
+ }
+
+ mutex_enter(&dp->dp_lock);
+ while ((th = list_remove_head(&dp->dp_txg_history))) {
+ dp->dp_txg_history_size--;
+ mutex_destroy(&th->th_lock);
+ kmem_free(th, sizeof(txg_history_t));
+ }
+
+ ASSERT3U(dp->dp_txg_history_size, ==, 0);
+ list_destroy(&dp->dp_txg_history);
+ mutex_exit(&dp->dp_lock);
+}
+
+txg_history_t *
+dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
+{
+ txg_history_t *th, *rm;
+
+ th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP);
+ mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
+ th->th_kstat.txg = txg;
+ th->th_kstat.state = TXG_STATE_OPEN;
+ th->th_kstat.birth = gethrtime();
+
+ mutex_enter(&dp->dp_lock);
+
+ list_insert_head(&dp->dp_txg_history, th);
+ dp->dp_txg_history_size++;
+
+ while (dp->dp_txg_history_size > zfs_txg_history) {
+ dp->dp_txg_history_size--;
+ rm = list_remove_tail(&dp->dp_txg_history);
+ mutex_destroy(&rm->th_lock);
+ kmem_free(rm, sizeof(txg_history_t));
+ }
+
+ mutex_exit(&dp->dp_lock);
+
+ return (th);
+}
+
+/*
+ * Traversed youngest to oldest because lookups are only done for open
+ * or syncing txgs which are guaranteed to be at the head of the list.
+ * The txg_history_t structure will be returned locked.
+ */
+txg_history_t *
+dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
+{
+ txg_history_t *th;
+
+ mutex_enter(&dp->dp_lock);
+ for (th = list_head(&dp->dp_txg_history); th != NULL;
+ th = list_next(&dp->dp_txg_history, th)) {
+ if (th->th_kstat.txg == txg) {
+ mutex_enter(&th->th_lock);
+ break;
+ }
+ }
+ mutex_exit(&dp->dp_lock);
+
+ return (th);
+}
+
+void
+dsl_pool_txg_history_put(txg_history_t *th)
+{
+ mutex_exit(&th->th_lock);
+}
+
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@@ -96,6 +234,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
1, 4, 0);
+ dsl_pool_txg_history_init(dp, txg);
+
return (dp);
}
@@ -213,6 +353,7 @@ dsl_pool_close(dsl_pool_t *dp)
arc_flush(dp->dp_spa);
txg_fini(dp);
dsl_scan_fini(dp);
+ dsl_pool_txg_history_destroy(dp);
rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
taskq_destroy(dp->dp_iput_taskq);
@@ -864,6 +1005,9 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
module_param(zfs_txg_synctime_ms, int, 0644);
MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
+
module_param(zfs_write_limit_min, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");
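dsl_pool_txg_history_add() keeps the list bounded: new entries are
inserted at the head and anything beyond zfs_txg_history entries is
trimmed from the tail. A self-contained userspace model of that trimming
policy (the kernel version additionally holds dp_lock and a per-entry
mutex):

	#include <stdio.h>
	#include <stdlib.h>

	struct entry {
		unsigned long long txg;
		struct entry *next;		/* next-older entry */
	};

	static struct entry *head;
	static int size;
	static int limit = 3;			/* models zfs_txg_history */

	static void
	history_add(unsigned long long txg)
	{
		struct entry *e = calloc(1, sizeof (*e));
		struct entry **tail;

		e->txg = txg;
		e->next = head;
		head = e;
		size++;

		/* Trim the oldest entries while over the limit. */
		while (size > limit) {
			for (tail = &head; (*tail)->next != NULL;
			    tail = &(*tail)->next)
				;
			free(*tail);
			*tail = NULL;
			size--;
		}
	}

	int
	main(void)
	{
		struct entry *e;
		unsigned long long txg;

		for (txg = 1; txg <= 5; txg++)
			history_add(txg);
		for (e = head; e != NULL; e = e->next)
			printf("txg %llu\n", e->txg);	/* 5, 4, 3 */
		return (0);
	}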
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 5702a61f0..17494bcda 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -29,6 +29,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
+#include <sys/spa_impl.h>
/*
* Pool-wide transaction groups.
@@ -279,6 +280,8 @@ txg_rele_to_sync(txg_handle_t *th)
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
+ hrtime_t start;
+ txg_history_t *th;
tx_state_t *tx = &dp->dp_tx;
int g = txg & TXG_MASK;
int c;
@@ -293,6 +296,15 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
tx->tx_open_txg++;
/*
+ * Measure how long the txg was open and replace the kstat.
+ */
+ th = dsl_pool_txg_history_get(dp, txg);
+ th->th_kstat.open_time = gethrtime() - th->th_kstat.birth;
+ th->th_kstat.state = TXG_STATE_QUIESCING;
+ dsl_pool_txg_history_put(th);
+ dsl_pool_txg_history_add(dp, tx->tx_open_txg);
+
+ /*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
*/
@@ -302,6 +314,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
/*
* Quiesce the transaction group by waiting for everyone to txg_exit().
*/
+ start = gethrtime();
+
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
mutex_enter(&tc->tc_lock);
@@ -309,6 +323,13 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
cv_wait(&tc->tc_cv[g], &tc->tc_lock);
mutex_exit(&tc->tc_lock);
}
+
+ /*
+ * Measure how long the txg took to quiesce.
+ */
+ th = dsl_pool_txg_history_get(dp, txg);
+ th->th_kstat.quiesce_time = gethrtime() - start;
+ dsl_pool_txg_history_put(th);
}
static void
@@ -395,6 +416,8 @@ txg_sync_thread(dsl_pool_t *dp)
start = delta = 0;
for (;;) {
+ hrtime_t hrstart;
+ txg_history_t *th;
uint64_t timer, timeout;
uint64_t txg;
@@ -441,11 +464,17 @@ txg_sync_thread(dsl_pool_t *dp)
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
+ th = dsl_pool_txg_history_get(dp, txg);
+ th->th_kstat.state = TXG_STATE_SYNCING;
+ vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
+ dsl_pool_txg_history_put(th);
+
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
start = ddi_get_lbolt();
+ hrstart = gethrtime();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
@@ -458,6 +487,23 @@ txg_sync_thread(dsl_pool_t *dp)
* Dispatch commit callbacks to worker threads.
*/
txg_dispatch_callbacks(dp, txg);
+
+ /*
+ * Measure the txg sync time and determine the amount of I/O done.
+ */
+ th = dsl_pool_txg_history_get(dp, txg);
+ vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
+ th->th_kstat.sync_time = gethrtime() - hrstart;
+ th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
+ th->th_vs1.vs_bytes[ZIO_TYPE_READ];
+ th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
+ th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
+ th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
+ th->th_vs1.vs_ops[ZIO_TYPE_READ];
+ th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
+ th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
+ th->th_kstat.state = TXG_STATE_COMMITTED;
+ dsl_pool_txg_history_put(th);
}
}
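The per-txg I/O figures are just differences between the two vdev_stat_t
snapshots taken around spa_sync(). With illustrative numbers, deriving
throughput and IOPS from one record looks like:

	#include <stdio.h>

	int
	main(void)
	{
		/* Illustrative deltas for a single spa_sync(). */
		unsigned long long nwritten = 64ULL << 20;	/* 64 MiB */
		unsigned long long writes = 512;		/* write ops */
		long long sync_time = 800000000LL;		/* 0.8s in ns */
		double secs = sync_time / 1e9;

		printf("%.1f MiB/s, %.0f IOPS, %.0f KiB/op\n",
		    (nwritten / 1048576.0) / secs, writes / secs,
		    nwritten / 1024.0 / writes);	/* 80.0, 640, 128 */
		return (0);
	}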
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 0b98231ec..db6a831d2 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -294,6 +294,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_INT32, zio->io_error, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
DATA_TYPE_INT32, zio->io_flags, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
+ DATA_TYPE_UINT32, zio->io_stage, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
+ DATA_TYPE_UINT32, zio->io_pipeline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
DATA_TYPE_UINT64, zio->io_delay, NULL);
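io_stage and io_pipeline are stage bit masks (one bit per pipeline stage,
per zio_impl.h), so a delay ereport now records both where the zio
currently is and which stages it was scheduled to traverse. A sketch,
assuming the one-hot stage encoding, of mapping a stage bit back to an
index:

	#include <stdio.h>
	#include <stdint.h>

	/* Assumes stages are one-hot bits, as laid out in zio_impl.h. */
	static int
	stage_index(uint32_t stage)
	{
		int i = 0;

		while (stage >>= 1)
			i++;
		return (i);
	}

	int
	main(void)
	{
		uint32_t stage = 1u << 17;	/* hypothetical stage bit */

		printf("stage bit 0x%x is pipeline stage %d\n",
		    stage, stage_index(stage));
		return (0);
	}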
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ece3329d0..bcfc081d1 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1305,18 +1305,34 @@ __zio_execute(zio_t *zio)
int
zio_wait(zio_t *zio)
{
+ uint64_t timeout;
int error;
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
+ timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz);
__zio_execute(zio);
mutex_enter(&zio->io_lock);
- while (zio->io_executor != NULL)
- cv_wait(&zio->io_cv, &zio->io_lock);
+ while (zio->io_executor != NULL) {
+ /*
+ * Wake up periodically to prevent the kernel from complaining
+ * about a blocked task. However, check zio_delay_max to see
+ * if the I/O has exceeded the timeout and post an ereport.
+ */
+ cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock,
+ ddi_get_lbolt() + hz);
+
+ if (timeout && (ddi_get_lbolt() > timeout)) {
+ zio->io_delay = zio_delay_max;
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
+ timeout = 0;
+ }
+ }
mutex_exit(&zio->io_lock);
error = zio->io_error;
@@ -2889,15 +2905,11 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, zio->io_size);
/*
- * If this I/O is attached to a particular vdev is slow, exeeding
- * 30 seconds to complete, post an error described the I/O delay.
- * We ignore these errors if the device is currently unavailable.
+ * When an I/O completes but was slow, post an ereport.
*/
- if (zio->io_delay >= zio_delay_max) {
- if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
- zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
- zio->io_vd, zio, 0, 0);
- }
+ if (zio->io_delay >= zio_delay_max)
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+ zio->io_vd, zio, 0, 0);
if (zio->io_error) {
/*
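For reference, the deadline computed in zio_wait() converts zio_delay_max
(milliseconds) into clock ticks. A worked example of that arithmetic,
assuming the 30-second default noted above and a hypothetical tick rate
of hz = 100:

	#include <stdio.h>

	#define	MILLISEC	1000

	int
	main(void)
	{
		unsigned long zio_delay_max = 30000;	/* 30s, in ms */
		unsigned long hz = 100;			/* assumed tick rate */

		/*
		 * Mirrors: timeout = ddi_get_lbolt() +
		 *     (zio_delay_max / MILLISEC * hz)
		 */
		printf("deadline is lbolt + %lu ticks\n",
		    zio_delay_max / MILLISEC * hz);	/* 3000 ticks */
		return (0);
	}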