author		Serapheim Dimitropoulos <[email protected]>	2018-08-20 09:52:37 -0700
committer	Brian Behlendorf <[email protected]>	2018-08-20 09:52:37 -0700
commit		a448a2557ec4938ed6944c7766fe0b8e6e5f6456
tree		1d622c6c40aeb9c34d233ad562b2920ab2ef651c /module/zfs
parent		fa84714abbb9316208bef7188009ee74204d532e
Introduce read/write kstats per dataset
The following patch introduces a few statistics on reads and writes
grouped by dataset. These statistics are implemented as kstats
(backed by aggregate sums for performance) and can be retrieved by
using the dataset objset ID number. The motivation for this change is
to provide some preliminary analytics on dataset usage/performance.
Reviewed-by: Richard Elling <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Signed-off-by: Serapheim Dimitropoulos <[email protected]>
Closes #7705
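
The new kstats are keyed by pool name (kstat module "zfs/<pool>") and objset ID (kstat name "objset-0x<id>"), so on Linux they are expected to surface through the SPL procfs bridge, presumably under /proc/spl/kstat/zfs/<pool>/objset-0x<id>. That path, and the pool name and objset ID used below, are illustrative assumptions rather than something this commit states; a minimal userland sketch for dumping one node:

```c
/*
 * Hypothetical consumer sketch: dump one per-dataset kstat node.
 * Assumes the node appears at /proc/spl/kstat/zfs/<pool>/objset-0x<id>;
 * "tank" and 0x36 are placeholder values.
 */
#include <stdio.h>

int
main(void)
{
	FILE *fp = fopen("/proc/spl/kstat/zfs/tank/objset-0x36", "r");
	char line[512];

	if (fp == NULL) {
		perror("fopen");
		return (1);
	}
	/* Expected rows: dataset_name, writes, nwritten, reads, nread */
	while (fgets(line, sizeof (line), fp) != NULL)
		fputs(line, stdout);
	(void) fclose(fp);
	return (0);
}
```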
Diffstat (limited to 'module/zfs')
-rw-r--r--	module/zfs/Makefile.in		1
-rw-r--r--	module/zfs/dataset_kstats.c	185
-rw-r--r--	module/zfs/spa.c		2
-rw-r--r--	module/zfs/zfs_vfsops.c		6
-rw-r--r--	module/zfs/zfs_vnops.c		105
-rw-r--r--	module/zfs/zpl_file.c		2
-rw-r--r--	module/zfs/zvol.c		49
7 files changed, 277 insertions(+), 73 deletions(-)
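
The diffstat is limited to module/zfs, so the companion header <sys/dataset_kstats.h> that the new dataset_kstats.c includes is not shown here. The following is only a sketch of the declarations that header would need to provide, inferred from how the fields are used in the code below; the real header's layout may differ:

```c
/*
 * Inferred sketch of sys/dataset_kstats.h (not part of this diffstat).
 * Type and field names are taken from their uses in dataset_kstats.c.
 */
#include <sys/aggsum.h>
#include <sys/kstat.h>

typedef struct dataset_aggsum_stats {
	aggsum_t das_writes;	/* number of write operations */
	aggsum_t das_nwritten;	/* bytes written */
	aggsum_t das_reads;	/* number of read operations */
	aggsum_t das_nread;	/* bytes read */
} dataset_aggsum_stats_t;

typedef struct dataset_kstat_values {
	kstat_named_t dkv_ds_name;
	kstat_named_t dkv_writes;
	kstat_named_t dkv_nwritten;
	kstat_named_t dkv_reads;
	kstat_named_t dkv_nread;
} dataset_kstat_values_t;

typedef struct dataset_kstats {
	dataset_aggsum_stats_t dk_aggsums;
	kstat_t *dk_kstats;
} dataset_kstats_t;
```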
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index d8d1e3a23..2c27cf65c 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -27,6 +27,7 @@ $(MODULE)-objs += dbuf.o
 $(MODULE)-objs += dbuf_stats.o
 $(MODULE)-objs += bptree.o
 $(MODULE)-objs += bqueue.o
+$(MODULE)-objs += dataset_kstats.o
 $(MODULE)-objs += ddt.o
 $(MODULE)-objs += ddt_zap.o
 $(MODULE)-objs += dmu.o
diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
new file mode 100644
index 000000000..ac0ad84ed
--- /dev/null
+++ b/module/zfs/dataset_kstats.c
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static dataset_kstat_values_t empty_dataset_kstats = {
+	{ "dataset_name",	KSTAT_DATA_STRING },
+	{ "writes",		KSTAT_DATA_UINT64 },
+	{ "nwritten",		KSTAT_DATA_UINT64 },
+	{ "reads",		KSTAT_DATA_UINT64 },
+	{ "nread",		KSTAT_DATA_UINT64 },
+};
+
+static int
+dataset_kstats_update(kstat_t *ksp, int rw)
+{
+	dataset_kstats_t *dk = ksp->ks_private;
+	ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data);
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+	dkv->dkv_writes.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_writes);
+	dkv->dkv_nwritten.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nwritten);
+	dkv->dkv_reads.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_reads);
+	dkv->dkv_nread.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nread);
+
+	return (0);
+}
+
+void
+dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
+{
+	/*
+	 * There should not be anything wrong with having kstats for
+	 * snapshots. Since we are not sure how useful they would be
+	 * though nor how much their memory overhead would matter in
+	 * a filesystem with many snapshots, we skip them for now.
+	 */
+	if (dmu_objset_is_snapshot(objset))
+		return;
+
+	/*
+	 * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
+	 * and the spa_name can theoretically be up to 256 characters.
+	 * In reality though the spa_name can be 240 characters max
+	 * [see origin directory name check in pool_namecheck()]. Thus,
+	 * the naming scheme for the module name below should not cause
+	 * any truncations. In the event that a truncation does happen
+	 * though, due to some future change, we silently skip creating
+	 * the kstat and log the event.
+	 */
+	char kstat_module_name[KSTAT_STRLEN];
+	int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
+	    "zfs/%s", spa_name(dmu_objset_spa(objset)));
+	if (n < 0) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    " snprintf() for kstat module name returned %d",
+		    (unsigned long long)dmu_objset_id(objset), n);
+		return;
+	} else if (n >= KSTAT_STRLEN) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    "kstat module name length (%d) exceeds limit (%d)",
+		    (unsigned long long)dmu_objset_id(objset),
+		    n, KSTAT_STRLEN);
+		return;
+	}
+
+	char kstat_name[KSTAT_STRLEN];
+	n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
+	    (unsigned long long)dmu_objset_id(objset));
+	if (n < 0) {
+		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+		    " snprintf() for kstat name returned %d",
+		    (unsigned long long)dmu_objset_id(objset), n);
+		return;
+	}
+	ASSERT3U(n, <, KSTAT_STRLEN);
+
+	kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
+	    "dataset", KSTAT_TYPE_NAMED,
+	    sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (kstat == NULL)
+		return;
+
+	dataset_kstat_values_t *dk_kstats =
+	    kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
+	bcopy(&empty_dataset_kstats, dk_kstats,
+	    sizeof (empty_dataset_kstats));
+
+	char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	dsl_dataset_name(objset->os_dsl_dataset, ds_name);
+	KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
+	KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
+	    ZFS_MAX_DATASET_NAME_LEN;
+
+	kstat->ks_data = dk_kstats;
+	kstat->ks_update = dataset_kstats_update;
+	kstat->ks_private = dk;
+
+	kstat_install(kstat);
+	dk->dk_kstats = kstat;
+
+	aggsum_init(&dk->dk_aggsums.das_writes, 0);
+	aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
+	aggsum_init(&dk->dk_aggsums.das_reads, 0);
+	aggsum_init(&dk->dk_aggsums.das_nread, 0);
+}
+
+void
+dataset_kstats_destroy(dataset_kstats_t *dk)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+	kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
+	    KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
+	kmem_free(dkv, sizeof (empty_dataset_kstats));
+
+	kstat_delete(dk->dk_kstats);
+	dk->dk_kstats = NULL;
+
+	aggsum_fini(&dk->dk_aggsums.das_writes);
+	aggsum_fini(&dk->dk_aggsums.das_nwritten);
+	aggsum_fini(&dk->dk_aggsums.das_reads);
+	aggsum_fini(&dk->dk_aggsums.das_nread);
+}
+
+void
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
+    int64_t nwritten)
+{
+	ASSERT3S(nwritten, >=, 0);
+
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_writes, 1);
+	aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten);
+}
+
+void
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
+    int64_t nread)
+{
+	ASSERT3S(nread, >=, 0);
+
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_reads, 1);
+	aggsum_add(&dk->dk_aggsums.das_nread, nread);
+}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 0d0cb556c..39f329bea 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -306,6 +306,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_LOCAL);
 		}
+		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+		    NULL, spa_load_guid(spa), src);
 	}
 
 	if (pool != NULL) {
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index a477c8669..488eaa4f2 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -1256,6 +1256,9 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 		/* restore readonly bit */
 		if (readonly != 0)
 			readonly_changed_cb(zfsvfs, B_TRUE);
+
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 	}
 
 	/*
@@ -1288,6 +1291,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
 	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
 	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
 	zfsvfs_vfs_free(zfsvfs->z_vfs);
+	dataset_kstats_destroy(&zfsvfs->z_kstat);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 35bbd884b..4e163e2e3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
@@ -438,15 +438,10 @@ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 int
 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	zfsvfs_t	*zfsvfs = ITOZSB(ip);
-	ssize_t		n, nbytes;
-	int		error = 0;
-	rl_t		*rl;
-#ifdef HAVE_UIO_ZEROCOPY
-	xuio_t		*xuio = NULL;
-#endif /* HAVE_UIO_ZEROCOPY */
+	int error = 0;
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
@@ -482,8 +477,8 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	/*
 	 * Lock the range against changes.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
-	    RL_READER);
+	rl_t *rl = zfs_range_lock(&zp->z_range_lock,
+	    uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
@@ -495,9 +490,11 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	}
 	ASSERT(uio->uio_loffset < zp->z_size);
-	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t start_resid = n;
 
 #ifdef HAVE_UIO_ZEROCOPY
+	xuio_t *xuio = NULL;
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
@@ -529,7 +526,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif /* HAVE_UIO_ZEROCOPY */
 
 	while (n > 0) {
-		nbytes = MIN(n, zfs_read_chunk_size -
+		ssize_t nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
@@ -548,6 +545,10 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		n -= nbytes;
 	}
+
+	int64_t nread = start_resid - n;
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+	task_io_account_read(nread);
 
 out:
 	zfs_range_unlock(rl);
@@ -578,46 +579,28 @@ out:
 int
 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	rlim64_t	limit = uio->uio_limit;
-	ssize_t		start_resid = uio->uio_resid;
-	ssize_t		tx_bytes;
-	uint64_t	end_size;
-	dmu_tx_t	*tx;
-	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
-	zilog_t		*zilog;
-	offset_t	woff;
-	ssize_t		n, nbytes;
-	rl_t		*rl;
-	int		max_blksz = zfsvfs->z_max_blksz;
-	int		error = 0;
-	arc_buf_t	*abuf;
-	const iovec_t	*aiov = NULL;
-	xuio_t		*xuio = NULL;
-	int		write_eof;
-	int		count = 0;
-	sa_bulk_attr_t	bulk[4];
-	uint64_t	mtime[2], ctime[2];
-	uint32_t	uid;
-#ifdef HAVE_UIO_ZEROCOPY
-	int		i_iov = 0;
-	const iovec_t	*iovp = uio->uio_iov;
-	ASSERTV(int	iovcnt = uio->uio_iovcnt);
-#endif
+	int error = 0;
+	ssize_t start_resid = uio->uio_resid;
 
 	/*
 	 * Fasttrack empty write
 	 */
-	n = start_resid;
+	ssize_t n = start_resid;
 	if (n == 0)
 		return (0);
 
+	rlim64_t limit = uio->uio_limit;
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+	uint64_t mtime[2], ctime[2];
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
@@ -644,17 +627,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		return (SET_ERROR(EPERM));
 	}
 
-	zilog = zfsvfs->z_log;
-
 	/*
 	 * Validate file offset
 	 */
-	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+	offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
+	int max_blksz = zfsvfs->z_max_blksz;
+	xuio_t *xuio = NULL;
+
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
@@ -668,6 +652,8 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif
 	uio_prefaultpages(MIN(n, max_blksz), uio);
 
+	rl_t *rl;
+
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
@@ -706,9 +692,16 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
-	write_eof = (woff + n > zp->z_size);
+	int write_eof = (woff + n > zp->z_size);
+
+	uint64_t end_size = MAX(zp->z_size, woff + n);
+	zilog_t *zilog = zfsvfs->z_log;
+#ifdef HAVE_UIO_ZEROCOPY
+	int i_iov = 0;
+	const iovec_t *iovp = uio->uio_iov;
+	ASSERTV(int iovcnt = uio->uio_iovcnt);
+#endif
 
-	end_size = MAX(zp->z_size, woff + n);
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
@@ -716,8 +709,8 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
-		abuf = NULL;
 		woff = uio->uio_loffset;
+
 		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 		    KUID_TO_SUID(ip->i_uid)) ||
 		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
@@ -725,13 +718,13 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
 		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 		    zp->z_projid))) {
-			if (abuf != NULL)
-				dmu_return_arcbuf(abuf);
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
-		if (xuio && abuf == NULL) {
+		arc_buf_t *abuf = NULL;
+		const iovec_t *aiov = NULL;
+		if (xuio) {
 #ifdef HAVE_UIO_ZEROCOPY
 			ASSERT(i_iov < iovcnt);
 			ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
@@ -743,8 +736,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 #endif
-		} else if (abuf == NULL && n >= max_blksz &&
-		    woff >= zp->z_size &&
+		} else if (n >= max_blksz && woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
@@ -771,7 +763,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		/*
 		 * Start a transaction.
 		 */
-		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
@@ -812,8 +804,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
-		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
+		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -873,7 +866,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	 * user 0 is not an ephemeral uid.
 	 */
 	mutex_enter(&zp->z_acl_lock);
-	uid = KUID_TO_SUID(ip->i_uid);
+	uint32_t uid = KUID_TO_SUID(ip->i_uid);
 	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
 	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
@@ -937,6 +930,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
+	int64_t nwritten = start_resid - uio->uio_resid;
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+	task_io_account_write(nwritten);
+
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index 5b6839dd4..91251f9e6 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -242,7 +242,6 @@ zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 
 	read = count - uio.uio_resid;
 	*ppos += read;
-	task_io_account_read(read);
 
 	return (read);
 }
@@ -339,7 +338,6 @@ zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 
 	wrote = count - uio.uio_resid;
 	*ppos += wrote;
-	task_io_account_write(wrote);
 
 	return (wrote);
 }
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ecba516fc..19bc1b18e 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -36,7 +36,7 @@
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /*
@@ -74,6 +74,7 @@
  * and zvol_release()->zvol_last_close() directly as well.
 */
 
+#include <sys/dataset_kstats.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
@@ -88,7 +89,9 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
+
 #include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
 
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
@@ -125,6 +128,7 @@ struct zvol_state {
 	dev_t			zv_dev;		/* device id */
 	struct gendisk		*zv_disk;	/* generic disk */
 	struct request_queue	*zv_queue;	/* request queue */
+	dataset_kstats_t	zv_kstat;	/* zvol kstats */
 	list_node_t		zv_next;	/* next zvol_state_t linkage */
 	uint64_t		zv_hash;	/* name hash */
 	struct hlist_node	zv_hlink;	/* hash link */
@@ -730,25 +734,25 @@ uio_from_bio(uio_t *uio, struct bio *bio)
 static void
 zvol_write(void *arg)
 {
+	int error = 0;
+
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
-	zvol_state_t *zv = zvr->zv;
-	uint64_t volsize = zv->zv_volsize;
-	boolean_t sync;
-	int error = 0;
-	unsigned long start_jif;
-
 	uio_from_bio(&uio, bio);
 
+	zvol_state_t *zv = zvr->zv;
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	start_jif = jiffies;
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
-	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	boolean_t sync =
+	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
+	uint64_t volsize = zv->zv_volsize;
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio.uio_loffset;
@@ -766,14 +770,20 @@ zvol_write(void *arg)
 			break;
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
-		if (error == 0)
+		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
+		}
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_range_unlock(zvr->rl);
+
+	int64_t nwritten = start_resid - uio.uio_resid;
+	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+	task_io_account_write(nwritten);
+
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -876,22 +886,22 @@ unlock:
 static void
 zvol_read(void *arg)
 {
+	int error = 0;
+
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
-	zvol_state_t *zv = zvr->zv;
-	uint64_t volsize = zv->zv_volsize;
-	int error = 0;
-	unsigned long start_jif;
-
 	uio_from_bio(&uio, bio);
 
+	zvol_state_t *zv = zvr->zv;
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	start_jif = jiffies;
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
+	uint64_t volsize = zv->zv_volsize;
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
@@ -909,6 +919,10 @@ zvol_read(void *arg)
 	}
 	zfs_range_unlock(zvr->rl);
 
+	int64_t nread = start_resid - uio.uio_resid;
+	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+	task_io_account_read(nread);
+
 	rw_exit(&zv->zv_suspend_lock);
 	blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
 	    start_jif);
@@ -1741,6 +1755,7 @@ zvol_free(void *arg)
 	ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
 
 	mutex_destroy(&zv->zv_state_lock);
+	dataset_kstats_destroy(&zv->zv_kstat);
 	kmem_free(zv, sizeof (zvol_state_t));
 }
 
@@ -1831,6 +1846,8 @@ zvol_create_minor_impl(const char *name)
 		else
 			zil_replay(os, zv, zvol_replay_vector);
 	}
+	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 
 	/*
 	 * When udev detects the addition of the device it will immediately
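
The commit message notes the counters are "backed by aggregate sums for performance". As the hunks above show, the read/write hot paths only ever call aggsum_add(), while the comparatively expensive global aggregation via aggsum_value() is deferred to dataset_kstats_update(), i.e. to the moment the kstat is actually read. A stand-alone sketch of that split, assuming only the existing sys/aggsum.h interfaces that the patch itself uses (the example_* names are illustrative, not part of the patch):

```c
/*
 * Minimal sketch of the accounting pattern used by this patch:
 * cheap per-operation updates, aggregation only on read-out.
 */
#include <sys/aggsum.h>

static aggsum_t example_nread;

void
example_init(void)
{
	aggsum_init(&example_nread, 0);	/* start the counter at zero */
}

void
example_account_read(int64_t nread)
{
	/* hot path: buffered add, no global synchronization cost */
	aggsum_add(&example_nread, nread);
}

uint64_t
example_report_read(void)
{
	/* kstat update path: fold the buckets into one value */
	return (aggsum_value(&example_nread));
}

void
example_fini(void)
{
	aggsum_fini(&example_nread);
}
```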