From 0b1401ee911c5a0c0bdb7a8e6ad36840cea3af24 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 1 Oct 2013 09:50:50 -0700 Subject: Add visibility in to txg sync behavior This change is an attempt to add visibility in to how txgs are being formed on a system, in real time. To do this, a list was added to the in memory SPA data structure for a pool, with each element on the list corresponding to txg. These entries are then exported through the kstat interface, which can then be interpreted in userspace. For each txg, the following information is exported: * Unique txg number (uint64_t) * The time the txd was born (hrtime_t) (*not* wall clock time; relative to the other entries on the list) * The current txg state ((O)pen/(Q)uiescing/(S)yncing/(C)ommitted) * The number of reserved bytes for the txg (uint64_t) * The number of bytes read during the txg (uint64_t) * The number of bytes written during the txg (uint64_t) * The number of read operations during the txg (uint64_t) * The number of write operations during the txg (uint64_t) * The time the txg was closed (hrtime_t) * The time the txg was quiesced (hrtime_t) * The time the txg was synced (hrtime_t) Note that while the raw kstat now stores relative hrtimes for the open, quiesce, and sync times. Those relative times are used to calculate how long each state took and these deltas and printed by output handlers. Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 14 +++ module/zfs/spa_stats.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++ module/zfs/txg.c | 27 ++++- 3 files changed, 322 insertions(+), 1 deletion(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index a4a717ac0..85a62588a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -548,12 +548,26 @@ typedef struct spa_stats_history { typedef struct spa_stats { spa_stats_history_t read_history; + spa_stats_history_t txg_history; } spa_stats_t; +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_SYNCED = 3, + TXG_STATE_COMMITTED = 4, +} txg_state_t; + extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 271a3c312..16ccbb0ad 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -32,6 +32,11 @@ int zfs_read_history = 0; */ int zfs_read_history_hits = 0; +/* + * Keeps stats on the last N txgs, disabled by default. + */ +int zfs_txg_history = 0; + /* * ========================================================================== * SPA Read History Routines @@ -227,15 +232,289 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) mutex_exit(&ssh->lock); } +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t nreserved; /* number of bytes reserved */ + hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ + list_node_t sth_link; +} spa_txg_history_t; + +static int +spa_txg_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state", + "nreserved", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "stime"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_txg_history_data(char *buf, size_t size, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_QUIESCED]; + + size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->nreserved, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_txg_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_txg_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_txg_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (rw == KSTAT_WRITE) { + spa_txg_history_t *sth; + + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t); + + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_txg_history_t), + offsetof(spa_txg_history_t, sth_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "txgs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_txg_history_update; + kstat_set_raw_ops(ksp, spa_txg_history_headers, + spa_txg_history_data, spa_txg_history_addr); + kstat_install(ksp); + } +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +/* + * Add a new txg to historical record. + */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth, *rm; + + if (zfs_txg_history == 0 && ssh->size == 0) + return; + + sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = gethrtime(); + + mutex_enter(&ssh->lock); + + list_insert_head(&ssh->list, sth); + ssh->size++; + + while (ssh->size > zfs_txg_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_txg_history_t)); + } + + mutex_exit(&ssh->lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + +/* + * Set txg IO stats. + */ +int +spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->nread = nread; + sth->nwritten = nwritten; + sth->reads = reads; + sth->writes = writes; + sth->nreserved = nreserved; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + void spa_stats_init(spa_t *spa) { spa_read_history_init(spa); + spa_txg_history_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_txg_history_destroy(spa); spa_read_history_destroy(spa); } @@ -245,4 +524,7 @@ MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); module_param(zfs_read_history_hits, int, 0644); MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); #endif diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 440353a50..7a3da8647 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -363,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime()); + spa_txg_history_add(dp->dp_spa, tx->tx_open_txg); + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. @@ -380,6 +384,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } + + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); } static void @@ -451,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp) spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + vdev_stat_t *vs1, *vs2; uint64_t start, delta; #ifdef _KERNEL @@ -464,6 +471,9 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_enter(tx, &cpr); + vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + start = delta = 0; for (;;) { uint64_t timer, timeout; @@ -499,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } - if (tx->tx_exiting) + if (tx->tx_exiting) { + kmem_free(vs2, sizeof(vdev_stat_t)); + kmem_free(vs1, sizeof(vdev_stat_t)); txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + } + + vdev_get_stats(spa->spa_root_vdev, vs1); /* * Consume the quiesced txg which has been handed off to @@ -529,6 +544,16 @@ txg_sync_thread(dsl_pool_t *dp) * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); + + vdev_get_stats(spa->spa_root_vdev, vs2); + spa_txg_history_set_io(spa, txg, + vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], + vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], + vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], + vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], + dp->dp_space_towrite[txg & TXG_MASK] + + dp->dp_tempreserved[txg & TXG_MASK] / 2); + spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); } } -- cgit v1.2.3