diff options
author | Brian Behlendorf <[email protected]> | 2012-11-06 14:44:07 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2012-11-06 14:48:06 -0800 |
commit | bbf8c748051294b104678f101ba1162632099b07 (patch) | |
tree | 4752e52379492cd45bdb5cd930496723dabed269 | |
parent | e8fd45a0f975c6b8ae8cd644714fc21f14fac2bf (diff) | |
parent | 9dcb97198338ba2d8764dd5604b278118612f74d (diff) |
Merge remote branch 'eris/stats'
Bring in support for the new KSTAT_TYPE_TXG type. This allows for
additional visibility in to the txg handling.
Signed-off-by: Brian Behlendorf <[email protected]>
-rw-r--r-- | include/sys/dsl_pool.h | 15 | ||||
-rw-r--r-- | include/sys/fm/fs/zfs.h | 2 | ||||
-rw-r--r-- | lib/libspl/include/sys/kstat.h | 27 | ||||
-rw-r--r-- | module/zfs/dsl_pool.c | 144 | ||||
-rw-r--r-- | module/zfs/txg.c | 46 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 4 | ||||
-rw-r--r-- | module/zfs/zio.c | 32 |
7 files changed, 259 insertions, 11 deletions
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 99a131eec..40e96101d 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -65,6 +65,13 @@ typedef struct zfs_all_blkstats { zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; } zfs_all_blkstats_t; +typedef struct txg_history { + kstat_txg_t th_kstat; + vdev_stat_t th_vs1; + vdev_stat_t th_vs2; + kmutex_t th_lock; + list_node_t th_link; +} txg_history_t; typedef struct dsl_pool { /* Immutable */ @@ -76,6 +83,7 @@ typedef struct dsl_pool { struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_iput_taskq; + kstat_t *dp_txg_kstat; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; @@ -92,6 +100,9 @@ typedef struct dsl_pool { kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; + uint64_t dp_txg_history_size; + list_t dp_txg_history; + /* Has its own locking */ tx_state_t dp_tx; @@ -144,6 +155,10 @@ extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); +txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); +txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_txg_history_put(txg_history_t *th); + #ifdef __cplusplus } #endif diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 6e652df98..15803c034 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -83,6 +83,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h index fcd3ed98b..6bd2ec878 100644 --- a/lib/libspl/include/sys/kstat.h +++ b/lib/libspl/include/sys/kstat.h @@ -228,8 +228,10 @@ typedef struct kstat32 { /* ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer */ /* ks_ndata >= 1 */ +#define KSTAT_TYPE_TXG 5 /* txg statistics */ + /* ks_ndata >= 0 */ -#define KSTAT_NUM_TYPES 5 +#define KSTAT_NUM_TYPES 6 /* * kstat class @@ -698,6 +700,29 @@ typedef struct kstat_timer { #define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data) +/* + * TXG statistics - bytes read/written and iops performed + */ +typedef enum kstat_txg_state { + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCING = 2, + TXG_STATE_SYNCING = 3, + TXG_STATE_COMMITTED = 4, +} kstat_txg_state_t; + +typedef struct kstat_txg { + u_longlong_t txg; /* txg id */ + kstat_txg_state_t state; /* txg state */ + hrtime_t birth; /* birth time stamp */ + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t open_time; /* open time */ + hrtime_t quiesce_time; /* quiesce time */ + hrtime_t sync_time; /* sync time */ +} kstat_txg_t; + #if defined(_KERNEL) #include <sys/t_lock.h> diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 53419778a..7e0fba589 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -44,6 +44,7 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ +int zfs_txg_history = 60; /* statistics for the last N txgs */ unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ @@ -54,6 +55,143 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; +static int +dsl_pool_txg_history_update(kstat_t *ksp, int rw) +{ + dsl_pool_t *dp = ksp->ks_private; + txg_history_t *th; + int i = 0; + + if (rw == KSTAT_WRITE) + return (EACCES); + + if (ksp->ks_data) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + mutex_enter(&dp->dp_lock); + + ksp->ks_ndata = dp->dp_txg_history_size; + ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); + if (ksp->ks_data_size > 0) + ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); + + /* Traversed oldest to youngest for the most readable kstat output */ + for (th = list_tail(&dp->dp_txg_history); th != NULL; + th = list_prev(&dp->dp_txg_history, th)) { + mutex_enter(&th->th_lock); + ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); + memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); + i += sizeof(kstat_txg_t); + mutex_exit(&th->th_lock); + } + + mutex_exit(&dp->dp_lock); + + return (0); +} + +static void +dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) +{ + char name[KSTAT_STRLEN]; + + list_create(&dp->dp_txg_history, sizeof (txg_history_t), + offsetof(txg_history_t, th_link)); + dsl_pool_txg_history_add(dp, txg); + + (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); + dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", + KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); + if (dp->dp_txg_kstat) { + dp->dp_txg_kstat->ks_data = NULL; + dp->dp_txg_kstat->ks_private = dp; + dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; + kstat_install(dp->dp_txg_kstat); + } +} + +static void +dsl_pool_txg_history_destroy(dsl_pool_t *dp) +{ + txg_history_t *th; + + if (dp->dp_txg_kstat) { + if (dp->dp_txg_kstat->ks_data) + kmem_free(dp->dp_txg_kstat->ks_data, + dp->dp_txg_kstat->ks_data_size); + + kstat_delete(dp->dp_txg_kstat); + } + + mutex_enter(&dp->dp_lock); + while ((th = list_remove_head(&dp->dp_txg_history))) { + dp->dp_txg_history_size--; + mutex_destroy(&th->th_lock); + kmem_free(th, sizeof(txg_history_t)); + } + + ASSERT3U(dp->dp_txg_history_size, ==, 0); + list_destroy(&dp->dp_txg_history); + mutex_exit(&dp->dp_lock); +} + +txg_history_t * +dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) +{ + txg_history_t *th, *rm; + + th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP); + mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); + th->th_kstat.txg = txg; + th->th_kstat.state = TXG_STATE_OPEN; + th->th_kstat.birth = gethrtime(); + + mutex_enter(&dp->dp_lock); + + list_insert_head(&dp->dp_txg_history, th); + dp->dp_txg_history_size++; + + while (dp->dp_txg_history_size > zfs_txg_history) { + dp->dp_txg_history_size--; + rm = list_remove_tail(&dp->dp_txg_history); + mutex_destroy(&rm->th_lock); + kmem_free(rm, sizeof(txg_history_t)); + } + + mutex_exit(&dp->dp_lock); + + return (th); +} + +/* + * Traversed youngest to oldest because lookups are only done for open + * or syncing txgs which are guaranteed to be at the head of the list. + * The txg_history_t structure will be returned locked. + */ +txg_history_t * +dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) +{ + txg_history_t *th; + + mutex_enter(&dp->dp_lock); + for (th = list_head(&dp->dp_txg_history); th != NULL; + th = list_next(&dp->dp_txg_history, th)) { + if (th->th_kstat.txg == txg) { + mutex_enter(&th->th_lock); + break; + } + } + mutex_exit(&dp->dp_lock); + + return (th); +} + +void +dsl_pool_txg_history_put(txg_history_t *th) +{ + mutex_exit(&th->th_lock); +} + int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { @@ -96,6 +234,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri, 1, 4, 0); + dsl_pool_txg_history_init(dp, txg); + return (dp); } @@ -213,6 +353,7 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); + dsl_pool_txg_history_destroy(dp); rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); @@ -864,6 +1005,9 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); module_param(zfs_txg_synctime_ms, int, 0644); MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); + module_param(zfs_write_limit_min, ulong, 0444); MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 5702a61f0..17494bcda 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -29,6 +29,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/spa_impl.h> /* * Pool-wide transaction groups. @@ -279,6 +280,8 @@ txg_rele_to_sync(txg_handle_t *th) static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { + hrtime_t start; + txg_history_t *th; tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; @@ -293,6 +296,15 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) tx->tx_open_txg++; /* + * Measure how long the txg was open and replace the kstat. + */ + th = dsl_pool_txg_history_get(dp, txg); + th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; + th->th_kstat.state = TXG_STATE_QUIESCING; + dsl_pool_txg_history_put(th); + dsl_pool_txg_history_add(dp, tx->tx_open_txg); + + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. */ @@ -302,6 +314,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ + start = gethrtime(); + for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); @@ -309,6 +323,13 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } + + /* + * Measure how long the txg took to quiesce. + */ + th = dsl_pool_txg_history_get(dp, txg); + th->th_kstat.quiesce_time = gethrtime() - start; + dsl_pool_txg_history_put(th); } static void @@ -395,6 +416,8 @@ txg_sync_thread(dsl_pool_t *dp) start = delta = 0; for (;;) { + hrtime_t hrstart; + txg_history_t *th; uint64_t timer, timeout; uint64_t txg; @@ -441,11 +464,17 @@ txg_sync_thread(dsl_pool_t *dp) tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); + th = dsl_pool_txg_history_get(dp, txg); + th->th_kstat.state = TXG_STATE_SYNCING; + vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); + dsl_pool_txg_history_put(th); + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); start = ddi_get_lbolt(); + hrstart = gethrtime(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; @@ -458,6 +487,23 @@ txg_sync_thread(dsl_pool_t *dp) * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); + + /* + * Measure the txg sync time determine the amount of I/O done. + */ + th = dsl_pool_txg_history_get(dp, txg); + vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); + th->th_kstat.sync_time = gethrtime() - hrstart; + th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - + th->th_vs1.vs_bytes[ZIO_TYPE_READ]; + th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - + th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; + th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - + th->th_vs1.vs_ops[ZIO_TYPE_READ]; + th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - + th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; + th->th_kstat.state = TXG_STATE_COMMITTED; + dsl_pool_txg_history_put(th); } } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 0b98231ec..db6a831d2 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -294,6 +294,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, DATA_TYPE_INT32, zio->io_flags, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, + DATA_TYPE_UINT32, zio->io_stage, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, + DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ece3329d0..bcfc081d1 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1305,18 +1305,34 @@ __zio_execute(zio_t *zio) int zio_wait(zio_t *zio) { + uint64_t timeout; int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; + timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz); __zio_execute(zio); mutex_enter(&zio->io_lock); - while (zio->io_executor != NULL) - cv_wait(&zio->io_cv, &zio->io_lock); + while (zio->io_executor != NULL) { + /* + * Wake up periodically to prevent the kernel from complaining + * about a blocked task. However, check zio_delay_max to see + * if the I/O has exceeded the timeout and post an ereport. + */ + cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock, + ddi_get_lbolt() + hz); + + if (timeout && (ddi_get_lbolt() > timeout)) { + zio->io_delay = zio_delay_max; + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio, 0, 0); + timeout = 0; + } + } mutex_exit(&zio->io_lock); error = zio->io_error; @@ -2889,15 +2905,11 @@ zio_done(zio_t *zio) vdev_stat_update(zio, zio->io_size); /* - * If this I/O is attached to a particular vdev is slow, exeeding - * 30 seconds to complete, post an error described the I/O delay. - * We ignore these errors if the device is currently unavailable. + * When an I/O completes but was slow post an ereport. */ - if (zio->io_delay >= zio_delay_max) { - if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, - zio->io_vd, zio, 0, 0); - } + if (zio->io_delay >= zio_delay_max) + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, + zio->io_vd, zio, 0, 0); if (zio->io_error) { /* |