7 files changed, 277 insertions, 73 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index d8d1e3a23..2c27cf65c 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -27,6 +27,7 @@ $(MODULE)-objs += dbuf.o
 $(MODULE)-objs += dbuf_stats.o
 $(MODULE)-objs += bptree.o
 $(MODULE)-objs += bqueue.o
+$(MODULE)-objs += dataset_kstats.o
 $(MODULE)-objs += ddt.o
 $(MODULE)-objs += ddt_zap.o
 $(MODULE)-objs += dmu.o
diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
new file mode 100644
index 000000000..ac0ad84ed
--- /dev/null
+++ b/module/zfs/dataset_kstats.c
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static dataset_kstat_values_t empty_dataset_kstats = {
+	{ "dataset_name",	KSTAT_DATA_STRING },
+	{ "writes",	KSTAT_DATA_UINT64 },
+	{ "nwritten",	KSTAT_DATA_UINT64 },
+	{ "reads",	KSTAT_DATA_UINT64 },
+	{ "nread",	KSTAT_DATA_UINT64 },
+};
+
+static int
+dataset_kstats_update(kstat_t *ksp, int rw)
+{
+	dataset_kstats_t *dk = ksp->ks_private;
+	ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data);
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+	dkv->dkv_writes.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_writes);
+	dkv->dkv_nwritten.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nwritten);
+	dkv->dkv_reads.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_reads);
+	dkv->dkv_nread.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nread);
+
+	return (0);
+}
+
+void
+dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
+{
+	/*
+	 * There should not be anything wrong with having kstats for
+	 * snapshots. Since we are not sure how useful they would be
+	 * though nor how much their memory overhead would matter in
+	 * a filesystem with many snapshots, we skip them for now.
+	 */
+	if (dmu_objset_is_snapshot(objset))
+		return;
+
+	/*
+	 * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
+	 * and the spa_name can theoretically be up to 256 characters.
+	 * In reality though the spa_name can be 240 characters max
+	 * [see origin directory name check in pool_namecheck()]. Thus,
+	 * the naming scheme for the module name below should not cause
+	 * any truncations. In the event that a truncation does happen
+	 * though, due to some future change, we silently skip creating
+	 * the kstat and log the event.
+	 */
+	char kstat_module_name[KSTAT_STRLEN];
+	int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
+	    "zfs/%s", spa_name(dmu_objset_spa(objset)));
+	if (n < 0) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    " snprintf() for kstat module name returned %d",
+		    (unsigned long long)dmu_objset_id(objset), n);
+		return;
+	} else if (n >= KSTAT_STRLEN) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    "kstat module name length (%d) exceeds limit (%d)",
+		    (unsigned long long)dmu_objset_id(objset),
+		    n, KSTAT_STRLEN);
+		return;
+	}
+
+	char kstat_name[KSTAT_STRLEN];
+	n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
+	    (unsigned long long)dmu_objset_id(objset));
+	if (n < 0) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    " snprintf() for kstat name returned %d",
+		    (unsigned long long)dmu_objset_id(objset), n);
+		return;
+	}
+	ASSERT3U(n, <, KSTAT_STRLEN);
+
+	kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
+	    "dataset", KSTAT_TYPE_NAMED,
+	    sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (kstat == NULL)
+		return;
+
+	dataset_kstat_values_t *dk_kstats =
+	    kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
+	bcopy(&empty_dataset_kstats, dk_kstats,
+	    sizeof (empty_dataset_kstats));
+
+	char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	dsl_dataset_name(objset->os_dsl_dataset, ds_name);
+	KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
+	KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
+	    ZFS_MAX_DATASET_NAME_LEN;
+
+	kstat->ks_data = dk_kstats;
+	kstat->ks_update = dataset_kstats_update;
+	kstat->ks_private = dk;
+
+	kstat_install(kstat);
+	dk->dk_kstats = kstat;
+
+	aggsum_init(&dk->dk_aggsums.das_writes, 0);
+	aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
+	aggsum_init(&dk->dk_aggsums.das_reads, 0);
+	aggsum_init(&dk->dk_aggsums.das_nread, 0);
+}
+
+void
+dataset_kstats_destroy(dataset_kstats_t *dk)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+	kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
+	    KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
+	kmem_free(dkv, sizeof (empty_dataset_kstats));
+
+	kstat_delete(dk->dk_kstats);
+	dk->dk_kstats = NULL;
+
+	aggsum_fini(&dk->dk_aggsums.das_writes);
+	aggsum_fini(&dk->dk_aggsums.das_nwritten);
+	aggsum_fini(&dk->dk_aggsums.das_reads);
+	aggsum_fini(&dk->dk_aggsums.das_nread);
+}
+
+void
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
+    int64_t nwritten)
+{
+	ASSERT3S(nwritten, >=, 0);
+
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_writes, 1);
+	aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten);
+}
+
+void
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
+    int64_t nread)
+{
+	ASSERT3S(nread, >=, 0);
+
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_reads, 1);
+	aggsum_add(&dk->dk_aggsums.das_nread, nread);
+}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 0d0cb556c..39f329bea 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -306,6 +306,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_LOCAL);
 		}
+		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+		    NULL, spa_load_guid(spa), src);
 	}
 
 	if (pool != NULL) {
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index a477c8669..488eaa4f2 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -1256,6 +1256,9 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 		/* restore readonly bit */
 		if (readonly != 0)
 			readonly_changed_cb(zfsvfs, B_TRUE);
+
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 	}
 
 	/*
@@ -1288,6 +1291,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
 	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
 	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
 	zfsvfs_vfs_free(zfsvfs->z_vfs);
+	dataset_kstats_destroy(&zfsvfs->z_kstat);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 35bbd884b..4e163e2e3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
@@ -438,15 +438,10 @@ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 int
 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	zfsvfs_t	*zfsvfs = ITOZSB(ip);
-	ssize_t		n, nbytes;
-	int		error = 0;
-	rl_t		*rl;
-#ifdef HAVE_UIO_ZEROCOPY
-	xuio_t		*xuio = NULL;
-#endif /* HAVE_UIO_ZEROCOPY */
+	int error = 0;
 
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
@@ -482,8 +477,8 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	/*
 	 * Lock the range against changes.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
-	    RL_READER);
+	rl_t *rl = zfs_range_lock(&zp->z_range_lock,
+	    uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
@@ -495,9 +490,11 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
-	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t start_resid = n;
 
 #ifdef HAVE_UIO_ZEROCOPY
+	xuio_t *xuio = NULL;
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
@@ -529,7 +526,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif /* HAVE_UIO_ZEROCOPY */
 
 	while (n > 0) {
-		nbytes = MIN(n, zfs_read_chunk_size -
+		ssize_t nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
@@ -548,6 +545,10 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 
 		n -= nbytes;
 	}
+
+	int64_t nread = start_resid - n;
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+	task_io_account_read(nread);
 out:
 	zfs_range_unlock(rl);
 
@@ -578,46 +579,28 @@ out:
 int
 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	rlim64_t	limit = uio->uio_limit;
-	ssize_t		start_resid = uio->uio_resid;
-	ssize_t		tx_bytes;
-	uint64_t	end_size;
-	dmu_tx_t	*tx;
-	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
-	zilog_t		*zilog;
-	offset_t	woff;
-	ssize_t		n, nbytes;
-	rl_t		*rl;
-	int		max_blksz = zfsvfs->z_max_blksz;
-	int		error = 0;
-	arc_buf_t	*abuf;
-	const iovec_t	*aiov = NULL;
-	xuio_t		*xuio = NULL;
-	int		write_eof;
-	int		count = 0;
-	sa_bulk_attr_t	bulk[4];
-	uint64_t	mtime[2], ctime[2];
-	uint32_t	uid;
-#ifdef HAVE_UIO_ZEROCOPY
-	int		i_iov = 0;
-	const iovec_t	*iovp = uio->uio_iov;
-	ASSERTV(int	iovcnt = uio->uio_iovcnt);
-#endif
+	int error = 0;
+	ssize_t start_resid = uio->uio_resid;
 
 	/*
 	 * Fasttrack empty write
 	 */
-	n = start_resid;
+	ssize_t n = start_resid;
 	if (n == 0)
 		return (0);
 
+	rlim64_t limit = uio->uio_limit;
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+	uint64_t mtime[2], ctime[2];
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
@@ -644,17 +627,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		return (SET_ERROR(EPERM));
 	}
 
-	zilog = zfsvfs->z_log;
-
 	/*
 	 * Validate file offset
 	 */
-	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+	offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
+	int max_blksz = zfsvfs->z_max_blksz;
+	xuio_t *xuio = NULL;
+
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
@@ -668,6 +652,8 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 
+	rl_t	 *rl;
+
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
@@ -706,9 +692,16 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
-	write_eof = (woff + n > zp->z_size);
+	int write_eof = (woff + n > zp->z_size);
+
+	uint64_t end_size = MAX(zp->z_size, woff + n);
+	zilog_t *zilog = zfsvfs->z_log;
+#ifdef HAVE_UIO_ZEROCOPY
+	int		i_iov = 0;
+	const iovec_t	*iovp = uio->uio_iov;
+	ASSERTV(int	iovcnt = uio->uio_iovcnt);
+#endif
 
-	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
@@ -716,8 +709,8 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
-		abuf = NULL;
 		woff = uio->uio_loffset;
+
 		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 		    KUID_TO_SUID(ip->i_uid)) ||
 		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
@@ -725,13 +718,13 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
 		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 		    zp->z_projid))) {
-			if (abuf != NULL)
-				dmu_return_arcbuf(abuf);
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
-		if (xuio && abuf == NULL) {
+		arc_buf_t *abuf = NULL;
+		const iovec_t *aiov = NULL;
+		if (xuio) {
 #ifdef HAVE_UIO_ZEROCOPY
 			ASSERT(i_iov < iovcnt);
 			ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
@@ -743,8 +736,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 #endif
-		} else if (abuf == NULL && n >= max_blksz &&
-		    woff >= zp->z_size &&
+		} else if (n >= max_blksz && woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
@@ -771,7 +763,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		/*
 		 * Start a transaction.
 		 */
-		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
@@ -812,8 +804,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
-		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
+		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -873,7 +866,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
-		uid = KUID_TO_SUID(ip->i_uid);
+		uint32_t uid = KUID_TO_SUID(ip->i_uid);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
@@ -937,6 +930,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
+	int64_t nwritten = start_resid - uio->uio_resid;
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+	task_io_account_write(nwritten);
+
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index 5b6839dd4..91251f9e6 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -242,7 +242,6 @@ zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 
 	read = count - uio.uio_resid;
 	*ppos += read;
-	task_io_account_read(read);
 
 	return (read);
 }
@@ -339,7 +338,6 @@ zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 
 	wrote = count - uio.uio_resid;
 	*ppos += wrote;
-	task_io_account_write(wrote);
 
 	return (wrote);
 }
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ecba516fc..19bc1b18e 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -36,7 +36,7 @@
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /*
@@ -74,6 +74,7 @@
  * and zvol_release()->zvol_last_close() directly as well.
  */
 
+#include <sys/dataset_kstats.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
@@ -88,7 +89,9 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
+
 #include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
 
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
@@ -125,6 +128,7 @@ struct zvol_state {
 	dev_t			zv_dev;		/* device id */
 	struct gendisk		*zv_disk;	/* generic disk */
 	struct request_queue	*zv_queue;	/* request queue */
+	dataset_kstats_t	zv_kstat;	/* zvol kstats */
 	list_node_t		zv_next;	/* next zvol_state_t linkage */
 	uint64_t		zv_hash;	/* name hash */
 	struct hlist_node	zv_hlink;	/* hash link */
@@ -730,25 +734,25 @@ uio_from_bio(uio_t *uio, struct bio *bio)
 static void
 zvol_write(void *arg)
 {
+	int error = 0;
+
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
-	zvol_state_t *zv = zvr->zv;
-	uint64_t volsize = zv->zv_volsize;
-	boolean_t sync;
-	int error = 0;
-	unsigned long start_jif;
-
 	uio_from_bio(&uio, bio);
 
+	zvol_state_t *zv = zvr->zv;
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	start_jif = jiffies;
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
-	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	boolean_t sync =
+	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
+	uint64_t volsize = zv->zv_volsize;
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio.uio_loffset;
@@ -766,14 +770,20 @@ zvol_write(void *arg)
 			break;
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
-		if (error == 0)
+		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
+		}
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_range_unlock(zvr->rl);
+
+	int64_t nwritten = start_resid - uio.uio_resid;
+	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+	task_io_account_write(nwritten);
+
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
@@ -876,22 +886,22 @@ unlock:
 static void
 zvol_read(void *arg)
 {
+	int error = 0;
+
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
-	zvol_state_t *zv = zvr->zv;
-	uint64_t volsize = zv->zv_volsize;
-	int error = 0;
-	unsigned long start_jif;
-
 	uio_from_bio(&uio, bio);
 
+	zvol_state_t *zv = zvr->zv;
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	start_jif = jiffies;
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
+	uint64_t volsize = zv->zv_volsize;
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 
@@ -909,6 +919,10 @@ zvol_read(void *arg)
 	}
 	zfs_range_unlock(zvr->rl);
 
+	int64_t nread = start_resid - uio.uio_resid;
+	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+	task_io_account_read(nread);
+
 	rw_exit(&zv->zv_suspend_lock);
 	blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
 	    start_jif);
@@ -1741,6 +1755,7 @@ zvol_free(void *arg)
 	ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
 
 	mutex_destroy(&zv->zv_state_lock);
+	dataset_kstats_destroy(&zv->zv_kstat);
 
 	kmem_free(zv, sizeof (zvol_state_t));
 }
@@ -1831,6 +1846,8 @@ zvol_create_minor_impl(const char *name)
 		else
 			zil_replay(os, zv, zvol_replay_vector);
 	}
+	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 
 	/*
 	 * When udev detects the addition of the device it will immediately