From e95853a331529a6cb96fdf10476c53441e59f4e1 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 23 Oct 2012 13:48:22 -0700
Subject: Add txgs-<pool> kstat file

Create a kstat file which contains useful statistics about the
last N txgs processed.  This can be helpful when analyzing pool
performance.  The new KSTAT_TYPE_TXG type was added for this
purpose and it tracks the following statistics per-txg.

  txg          - Unique txg number
  state        - State (O)pen/(Q)uiescing/(S)yncing/(C)ommitted
  birth;       - Creation time
  nread        - Bytes read
  nwritten;    - Bytes written
  reads        - IOPs read
  writes       - IOPs write
  open_time;   - Length in nanoseconds the txg was open
  quiesce_time - Length in nanoseconds the txg was quiescing
  sync_time;   - Length in nanoseconds the txg was syncing

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 include/sys/dsl_pool.h         |  15 +++++
 lib/libspl/include/sys/kstat.h |  27 +++++++-
 module/zfs/dsl_pool.c          | 144 +++++++++++++++++++++++++++++++++++++++++
 module/zfs/txg.c               |  46 +++++++++++++
 4 files changed, 231 insertions(+), 1 deletion(-)
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 99a131eec..40e96101d 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -65,6 +65,13 @@ typedef struct zfs_all_blkstats {
 	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
 } zfs_all_blkstats_t;
 
+typedef struct txg_history {
+	kstat_txg_t	th_kstat;
+	vdev_stat_t	th_vs1;
+	vdev_stat_t	th_vs2;
+	kmutex_t	th_lock;
+	list_node_t	th_link;
+} txg_history_t;
 
 typedef struct dsl_pool {
 	/* Immutable */
@@ -76,6 +83,7 @@ typedef struct dsl_pool {
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_iput_taskq;
+	kstat_t *dp_txg_kstat;
 
 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
@@ -92,6 +100,9 @@ typedef struct dsl_pool {
 	kmutex_t dp_lock;
 	uint64_t dp_space_towrite[TXG_SIZE];
 	uint64_t dp_tempreserved[TXG_SIZE];
+	uint64_t dp_txg_history_size;
+	list_t dp_txg_history;
+
 
 	/* Has its own locking */
 	tx_state_t dp_tx;
@@ -144,6 +155,10 @@ extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
 extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
 int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
 
+txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg);
+txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_txg_history_put(txg_history_t *th);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
index fcd3ed98b..6bd2ec878 100644
--- a/lib/libspl/include/sys/kstat.h
+++ b/lib/libspl/include/sys/kstat.h
@@ -228,8 +228,10 @@ typedef struct kstat32 {
 					/* ks_ndata == 1 */
 #define	KSTAT_TYPE_TIMER	4	/* event timer */
 					/* ks_ndata >= 1 */
+#define	KSTAT_TYPE_TXG		5	/* txg statistics */
+					/* ks_ndata >= 0 */
 
-#define	KSTAT_NUM_TYPES		5
+#define	KSTAT_NUM_TYPES		6
 
 /*
  * kstat class
@@ -698,6 +700,29 @@ typedef struct kstat_timer {
 
 #define	KSTAT_TIMER_PTR(kptr)	((kstat_timer_t *)(kptr)->ks_data)
 
+/*
+ * TXG statistics - bytes read/written and iops performed
+ */
+typedef enum kstat_txg_state {
+	TXG_STATE_OPEN      = 1,
+	TXG_STATE_QUIESCING = 2,
+	TXG_STATE_SYNCING   = 3,
+	TXG_STATE_COMMITTED = 4,
+} kstat_txg_state_t;
+
+typedef struct kstat_txg {
+	u_longlong_t		txg;		/* txg id */
+	kstat_txg_state_t	state;		/* txg state */
+	hrtime_t		birth;		/* birth time stamp */
+	u_longlong_t		nread;		/* number of bytes read */
+	u_longlong_t		nwritten;	/* number of bytes written */
+	uint_t			reads;		/* number of read operations */
+	uint_t			writes;		/* number of write operations */
+	hrtime_t		open_time;	/* open time */
+	hrtime_t		quiesce_time;	/* quiesce time */
+	hrtime_t		sync_time;	/* sync time */
+} kstat_txg_t;
+
 #if	defined(_KERNEL)
 
 #include <sys/t_lock.h>
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 53419778a..7e0fba589 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -44,6 +44,7 @@
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
 int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */
+int zfs_txg_history = 60;		/* statistics for the last N txgs */
 
 unsigned long zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
 unsigned long zfs_write_limit_max = 0;		/* max data payload per txg */
@@ -54,6 +55,143 @@ kmutex_t zfs_write_limit_lock;
 
 static pgcnt_t old_physmem = 0;
 
+static int
+dsl_pool_txg_history_update(kstat_t *ksp, int rw)
+{
+	dsl_pool_t *dp = ksp->ks_private;
+	txg_history_t *th;
+	int i = 0;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	if (ksp->ks_data)
+		kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+	mutex_enter(&dp->dp_lock);
+
+	ksp->ks_ndata = dp->dp_txg_history_size;
+	ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t);
+	if (ksp->ks_data_size > 0)
+		ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE);
+
+	/* Traversed oldest to youngest for the most readable kstat output */
+	for (th = list_tail(&dp->dp_txg_history); th != NULL;
+	     th = list_prev(&dp->dp_txg_history, th)) {
+		mutex_enter(&th->th_lock);
+		ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size);
+		memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t));
+		i += sizeof(kstat_txg_t);
+		mutex_exit(&th->th_lock);
+	}
+
+	mutex_exit(&dp->dp_lock);
+
+	return (0);
+}
+
+static void
+dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg)
+{
+	char name[KSTAT_STRLEN];
+
+	list_create(&dp->dp_txg_history, sizeof (txg_history_t),
+	    offsetof(txg_history_t, th_link));
+	dsl_pool_txg_history_add(dp, txg);
+
+	(void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa));
+	dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc",
+	    KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL);
+	if (dp->dp_txg_kstat) {
+		dp->dp_txg_kstat->ks_data = NULL;
+		dp->dp_txg_kstat->ks_private = dp;
+		dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update;
+		kstat_install(dp->dp_txg_kstat);
+	}
+}
+
+static void
+dsl_pool_txg_history_destroy(dsl_pool_t *dp)
+{
+	txg_history_t *th;
+
+	if (dp->dp_txg_kstat) {
+		if (dp->dp_txg_kstat->ks_data)
+			kmem_free(dp->dp_txg_kstat->ks_data,
+			    dp->dp_txg_kstat->ks_data_size);
+
+		kstat_delete(dp->dp_txg_kstat);
+	}
+
+	mutex_enter(&dp->dp_lock);
+	while ((th = list_remove_head(&dp->dp_txg_history))) {
+		dp->dp_txg_history_size--;
+		mutex_destroy(&th->th_lock);
+		kmem_free(th, sizeof(txg_history_t));
+	}
+
+	ASSERT3U(dp->dp_txg_history_size, ==, 0);
+	list_destroy(&dp->dp_txg_history);
+	mutex_exit(&dp->dp_lock);
+}
+
+txg_history_t *
+dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
+{
+	txg_history_t *th, *rm;
+
+	th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP);
+	mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
+	th->th_kstat.txg = txg;
+	th->th_kstat.state = TXG_STATE_OPEN;
+	th->th_kstat.birth = gethrtime();
+
+	mutex_enter(&dp->dp_lock);
+
+	list_insert_head(&dp->dp_txg_history, th);
+	dp->dp_txg_history_size++;
+
+	while (dp->dp_txg_history_size > zfs_txg_history) {
+		dp->dp_txg_history_size--;
+		rm = list_remove_tail(&dp->dp_txg_history);
+		mutex_destroy(&rm->th_lock);
+		kmem_free(rm, sizeof(txg_history_t));
+	}
+
+	mutex_exit(&dp->dp_lock);
+
+	return (th);
+}
+
+/*
+ * Traversed youngest to oldest because lookups are only done for open
+ * or syncing txgs which are guaranteed to be at the head of the list.
+ * The txg_history_t structure will be returned locked.
+ */
+txg_history_t *
+dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
+{
+	txg_history_t *th;
+
+	mutex_enter(&dp->dp_lock);
+	for (th = list_head(&dp->dp_txg_history); th != NULL;
+	     th = list_next(&dp->dp_txg_history, th)) {
+		if (th->th_kstat.txg == txg) {
+			mutex_enter(&th->th_lock);
+			break;
+		}
+	}
+	mutex_exit(&dp->dp_lock);
+
+	return (th);
+}
+
+void
+dsl_pool_txg_history_put(txg_history_t *th)
+{
+	mutex_exit(&th->th_lock);
+}
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -96,6 +234,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
 	    1, 4, 0);
 
+	dsl_pool_txg_history_init(dp, txg);
+
 	return (dp);
 }
 
@@ -213,6 +353,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
 	dsl_scan_fini(dp);
+	dsl_pool_txg_history_destroy(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_iput_taskq);
@@ -864,6 +1005,9 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
 module_param(zfs_txg_synctime_ms, int, 0644);
 MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");
 
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
+
 module_param(zfs_write_limit_min, ulong, 0444);
 MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");
 
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 5702a61f0..17494bcda 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -29,6 +29,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/callb.h>
+#include <sys/spa_impl.h>
 
 /*
  * Pool-wide transaction groups.
@@ -279,6 +280,8 @@ txg_rele_to_sync(txg_handle_t *th)
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
+	hrtime_t start;
+	txg_history_t *th;
 	tx_state_t *tx = &dp->dp_tx;
 	int g = txg & TXG_MASK;
 	int c;
@@ -292,6 +295,15 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
 
+	/*
+	 * Measure how long the txg was open and replace the kstat.
+	 */
+	th = dsl_pool_txg_history_get(dp, txg);
+	th->th_kstat.open_time = gethrtime() - th->th_kstat.birth;
+	th->th_kstat.state = TXG_STATE_QUIESCING;
+	dsl_pool_txg_history_put(th);
+	dsl_pool_txg_history_add(dp, tx->tx_open_txg);
+
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
@@ -302,6 +314,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
 	 */
+	start = gethrtime();
+
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
 		mutex_enter(&tc->tc_lock);
@@ -309,6 +323,13 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 		mutex_exit(&tc->tc_lock);
 	}
+
+	/*
+	 * Measure how long the txg took to quiesce.
+	 */
+	th = dsl_pool_txg_history_get(dp, txg);
+	th->th_kstat.quiesce_time = gethrtime() - start;
+	dsl_pool_txg_history_put(th);
 }
 
 static void
@@ -395,6 +416,8 @@ txg_sync_thread(dsl_pool_t *dp)
 
 	start = delta = 0;
 	for (;;) {
+		hrtime_t hrstart;
+		txg_history_t *th;
 		uint64_t timer, timeout;
 		uint64_t txg;
 
@@ -441,11 +464,17 @@ txg_sync_thread(dsl_pool_t *dp)
 		tx->tx_syncing_txg = txg;
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 
+		th = dsl_pool_txg_history_get(dp, txg);
+		th->th_kstat.state = TXG_STATE_SYNCING;
+		vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
+		dsl_pool_txg_history_put(th);
+
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 		mutex_exit(&tx->tx_sync_lock);
 
 		start = ddi_get_lbolt();
+		hrstart = gethrtime();
 		spa_sync(spa, txg);
 		delta = ddi_get_lbolt() - start;
 
@@ -458,6 +487,23 @@ txg_sync_thread(dsl_pool_t *dp)
 		 * Dispatch commit callbacks to worker threads.
 		 */
 		txg_dispatch_callbacks(dp, txg);
+
+		/*
+		 * Measure the txg sync time determine the amount of I/O done.
+		 */
+		th = dsl_pool_txg_history_get(dp, txg);
+		vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
+		th->th_kstat.sync_time = gethrtime() - hrstart;
+		th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
+		    th->th_vs1.vs_bytes[ZIO_TYPE_READ];
+		th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
+		    th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
+		th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
+		    th->th_vs1.vs_ops[ZIO_TYPE_READ];
+		th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
+		    th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
+		th->th_kstat.state = TXG_STATE_COMMITTED;
+		dsl_pool_txg_history_put(th);
 	}
 }
 
-- 
cgit v1.2.3


From 9dcb97198338ba2d8764dd5604b278118612f74d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 25 Oct 2012 13:02:31 -0700
Subject: Log I/Os longer than zio_delay_max (30s default)

There have been reports of ZFS deadlocking due to what appears to
be a lost IO.  This patch addes some debugging to determine the
exact state of the IO which neither 1) completed, 2) failed, or
3) timed out after zio_delay_max (30) seconds.

This information will be logged using the ZFS FMA infrastructure
as a 'delay' event and posted to the internal zevent log.  By
default the last 64 events will be kept in the log but the limit
is configurable via the zfs_zevent_len_max module option.

To dump the contents of the log use the 'zpool events -v' command
and look for the resource.fs.zfs.delay event.  It will include
various information about the pool, vdev, and zio which may shed
some light on the issue.

In the context of this change the 120 second kernel blocked thread
watchdog has been disabled for synchronous IOs.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #930
---
 include/sys/fm/fs/zfs.h |  2 ++
 module/zfs/zfs_fm.c     |  4 ++++
 module/zfs/zio.c        | 32 ++++++++++++++++++++++----------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 6e652df98..15803c034 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -83,6 +83,8 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET	"zio_offset"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE		"zio_size"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS	"zio_flags"
+#define	FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE	"zio_stage"
+#define	FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE	"zio_pipeline"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY	"zio_delay"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 0b98231ec..db6a831d2 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -294,6 +294,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
 		    DATA_TYPE_INT32, zio->io_flags, NULL);
+		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
+		    DATA_TYPE_UINT32, zio->io_stage, NULL);
+		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
+		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
 		    DATA_TYPE_UINT64, zio->io_delay, NULL);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ece3329d0..bcfc081d1 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1305,18 +1305,34 @@ __zio_execute(zio_t *zio)
 int
 zio_wait(zio_t *zio)
 {
+	uint64_t timeout;
 	int error;
 
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_executor == NULL);
 
 	zio->io_waiter = curthread;
+	timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz);
 
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
-	while (zio->io_executor != NULL)
-		cv_wait(&zio->io_cv, &zio->io_lock);
+	while (zio->io_executor != NULL) {
+		/*
+		 * Wake up periodically to prevent the kernel from complaining
+		 * about a blocked task.  However, check zio_delay_max to see
+		 * if the I/O has exceeded the timeout and post an ereport.
+		 */
+		cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock,
+		    ddi_get_lbolt() + hz);
+
+		if (timeout && (ddi_get_lbolt() > timeout)) {
+			zio->io_delay = zio_delay_max;
+			zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+			    zio->io_spa, zio->io_vd, zio, 0, 0);
+			timeout = 0;
+		}
+	}
 	mutex_exit(&zio->io_lock);
 
 	error = zio->io_error;
@@ -2889,15 +2905,11 @@ zio_done(zio_t *zio)
 	vdev_stat_update(zio, zio->io_size);
 
 	/*
-	 * If this I/O is attached to a particular vdev is slow, exeeding
-	 * 30 seconds to complete, post an error described the I/O delay.
-	 * We ignore these errors if the device is currently unavailable.
+	 * When an I/O completes but was slow post an ereport.
 	 */
-	if (zio->io_delay >= zio_delay_max) {
-		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
-                                         zio->io_vd, zio, 0, 0);
-	}
+	if (zio->io_delay >= zio_delay_max)
+		zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+		    zio->io_vd, zio, 0, 0);
 
 	if (zio->io_error) {
 		/*
-- 
cgit v1.2.3