author     Brian Behlendorf <[email protected]>   2013-10-25 15:22:34 -0700
committer  Brian Behlendorf <[email protected]>   2013-10-25 15:35:27 -0700
commit     8eaf9f3543aa6843aa276010768cce8c0626e2d8 (patch)
tree       9cd97bc1866b61b6539970b36aa02a3493401585
parent     11cb9d773f48830cf3ff718861c070a8937c6a03 (diff)
parent     d738d34da5b25b5e5daef966c29386468fd16263 (diff)
Merge branch 'kstat'
This branch updates several of the zfs kstats to take advantage of the improved raw kstat functionality. In addition, two new kstats and a script called dbufstat.py are introduced.

Updated+New Kstats
* dbufs - Stats for all dbufs in the dbuf_hash
* <pool>/txgs - Stats for the last N txgs synced to disk
* <pool>/reads - Stats for the last N reads issued by the ARC
* <pool>/dmu_tx_assign - Histogram of tx assign times

Signed-off-by: Brian Behlendorf <[email protected]>
-rw-r--r--  cmd/Makefile.am                    2
-rwxr-xr-x  cmd/arcstat/arcstat.py            10
-rw-r--r--  cmd/dbufstat/Makefile.am           2
-rwxr-xr-x  cmd/dbufstat/dbufstat.py         535
-rw-r--r--  configure.ac                       1
-rw-r--r--  include/sys/arc.h                 31
-rw-r--r--  include/sys/dbuf.h                 3
-rw-r--r--  include/sys/dmu.h                  1
-rw-r--r--  include/sys/dsl_pool.h            20
-rw-r--r--  include/sys/spa.h                 37
-rw-r--r--  include/sys/spa_impl.h             2
-rw-r--r--  include/sys/zfs_context.h          5
-rw-r--r--  include/sys/zio.h                  6
-rw-r--r--  lib/libspl/include/sys/kstat.h    27
-rw-r--r--  lib/libzpool/Makefile.am           2
-rw-r--r--  lib/libzpool/kernel.c              8
-rw-r--r--  module/zfs/Makefile.in             2
-rw-r--r--  module/zfs/arc.c                 107
-rw-r--r--  module/zfs/dbuf.c                  4
-rw-r--r--  module/zfs/dbuf_stats.c          230
-rw-r--r--  module/zfs/dmu.c                  18
-rw-r--r--  module/zfs/dmu_traverse.c          7
-rw-r--r--  module/zfs/dmu_tx.c                7
-rw-r--r--  module/zfs/dsl_pool.c            203
-rw-r--r--  module/zfs/spa_misc.c              4
-rw-r--r--  module/zfs/spa_stats.c           636
-rw-r--r--  module/zfs/txg.c                  69
27 files changed, 1653 insertions, 326 deletions
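
The new dbufs kstat is plain text, so it can be post-processed by tools other than the dbufstat.py script added below. The following minimal sketch (not part of this commit) illustrates that; it assumes the layout dbufstat.py itself parses -- two preamble lines followed by a row of column labels -- and the default /proc/spl/kstat/zfs/dbufs input path the script uses. The cached_bytes_per_pool helper name is purely illustrative.

import sys

KSTAT = "/proc/spl/kstat/zfs/dbufs"    # default input path used by dbufstat.py

def cached_bytes_per_pool(path=KSTAT):
    # Tally dbsize per pool, mirroring the 'cached' column dbufstat.py
    # derives from the per-dbuf dbsize field.
    totals = {}
    with open(path) as f:
        next(f)                        # kstat preamble line
        next(f)                        # column group line ("dbuf | arcbuf | dnode")
        labels = dict((v, i) for i, v in enumerate(next(f).split()))
        for line in f:
            fields = line.split()
            pool = fields[labels["pool"]]
            totals[pool] = totals.get(pool, 0) + int(fields[labels["dbsize"]])
    return totals

if __name__ == "__main__":
    for pool, nbytes in sorted(cached_bytes_per_pool().items()):
        sys.stdout.write("%-16s %d bytes cached\n" % (pool, nbytes))

The per-pool kstats (<pool>/txgs, <pool>/reads, <pool>/dmu_tx_assign) added by spa_stats.c are exported as text in the same way; as spa_read_history_update() later in this patch shows, writing to the reads kstat discards its accumulated history.
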
diff --git a/cmd/Makefile.am b/cmd/Makefile.am
index bad1af63e..4c3e4bbb2 100644
--- a/cmd/Makefile.am
+++ b/cmd/Makefile.am
@@ -1,2 +1,2 @@
SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios
-SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat
+SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat
diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py
index e01dd8b4c..f703e33cd 100755
--- a/cmd/arcstat/arcstat.py
+++ b/cmd/arcstat/arcstat.py
@@ -106,7 +106,7 @@ opfile = None
sep = " " # Default separator is 2 spaces
version = "0.4"
l2exist = False
-cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval "
+cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval "
"[count]]\n")
cur = {}
d = {}
@@ -136,10 +136,10 @@ def usage():
sys.stderr.write("\t -s : Override default field separator with custom "
"character or string\n")
sys.stderr.write("\nExamples:\n")
- sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n")
- sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n")
- sys.stderr.write("\tarcstat -v\n")
- sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n")
+ sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat.py -v\n")
+ sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n")
sys.stderr.write("\n")
sys.exit(1)
diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am
new file mode 100644
index 000000000..0548b240c
--- /dev/null
+++ b/cmd/dbufstat/Makefile.am
@@ -0,0 +1,2 @@
+bin_SCRIPTS = dbufstat.py
+EXTRA_DIST = $(bin_SCRIPTS)
diff --git a/cmd/dbufstat/dbufstat.py b/cmd/dbufstat/dbufstat.py
new file mode 100755
index 000000000..80569b1d4
--- /dev/null
+++ b/cmd/dbufstat/dbufstat.py
@@ -0,0 +1,535 @@
+#!/usr/bin/python
+#
+# Print out statistics for all cached dmu buffers. This information
+# is available through the dbufs kstat and may be post-processed as
+# needed by the script.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (C) 2013 Lawrence Livermore National Security, LLC.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+#
+
+import sys
+import getopt
+import errno
+
+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
+ "meta", "state", "dbholds", "list", "atype", "index", "flags", "count",
+ "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
+ "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize"]
+bincompat = ["cached", "direct", "indirect", "bonus", "spill"]
+
+dhdr = ["pool", "objset", "object", "dtype", "cached"]
+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
+ "indirect", "bonus", "spill"]
+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
+ "list", "atype", "index", "flags", "count", "asize", "access", "mru",
+ "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds"]
+
+thdr = ["pool", "objset", "dtype", "cached"]
+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
+ "bonus", "spill"]
+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
+ "dbholds", "list", "atype", "index", "flags", "count", "asize", "access",
+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp",
+ "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds",
+ "blocks", "dsize"]
+
+cols = {
+ # hdr: [size, scale, description]
+ "pool": [15, -1, "pool name"],
+ "objset": [6, -1, "dataset identification number"],
+ "object": [10, -1, "object number"],
+ "level": [5, -1, "indirection level of buffer"],
+ "blkid": [8, -1, "block number of buffer"],
+ "offset": [12, 1024, "offset in object of buffer"],
+ "dbsize": [7, 1024, "size of buffer"],
+ "meta": [4, -1, "is this buffer metadata?"],
+ "state": [5, -1, "state of buffer (read, cached, etc)"],
+ "dbholds": [7, 1000, "number of holds on buffer"],
+ "list": [4, -1, "which ARC list contains this buffer"],
+ "atype": [7, -1, "ARC header type (data or metadata)"],
+ "index": [5, -1, "buffer's index into its ARC list"],
+ "flags": [8, -1, "ARC read flags"],
+ "count": [5, -1, "ARC data count"],
+ "asize": [7, 1024, "size of this ARC buffer"],
+ "access": [10, -1, "time this ARC buffer was last accessed"],
+ "mru": [5, 1000, "hits while on the ARC's MRU list"],
+ "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"],
+ "mfu": [5, 1000, "hits while on the ARC's MFU list"],
+ "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"],
+ "l2": [5, 1000, "hits while on the L2ARC"],
+ "l2_dattr": [8, -1, "L2ARC disk address/offset"],
+ "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"],
+ "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"],
+ "aholds": [6, 1000, "number of holds on this ARC buffer"],
+ "dtype": [27, -1, "dnode type"],
+ "btype": [27, -1, "bonus buffer type"],
+ "data_bs": [7, 1024, "data block size"],
+ "meta_bs": [7, 1024, "metadata block size"],
+ "bsize": [6, 1024, "bonus buffer size"],
+ "lvls": [6, -1, "number of indirection levels"],
+ "dholds": [6, 1000, "number of holds on dnode"],
+ "blocks": [8, 1000, "number of allocated blocks"],
+ "dsize": [12, 1024, "size of dnode"],
+ "cached": [6, 1024, "bytes cached for all blocks"],
+ "direct": [6, 1024, "bytes cached for direct blocks"],
+ "indirect": [8, 1024, "bytes cached for indirect blocks"],
+ "bonus": [5, 1024, "bytes cached for bonus buffer"],
+ "spill": [5, 1024, "bytes cached for spill block"],
+}
+
+hdr = None
+xhdr = None
+sep = " " # Default separator is 2 spaces
+cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] "
+ "[-s string]\n")
+raw = 0
+
+def print_incompat_helper(incompat):
+ cnt = 0
+ for key in sorted(incompat):
+        if cnt == 0:
+ sys.stderr.write("\t")
+ elif cnt > 8:
+ sys.stderr.write(",\n\t")
+ cnt = 0
+ else:
+ sys.stderr.write(", ")
+
+ sys.stderr.write("%s" % key)
+ cnt += 1
+
+ sys.stderr.write("\n\n")
+
+def detailed_usage():
+ sys.stderr.write("%s\n" % cmd)
+
+ sys.stderr.write("Field definitions incompatible with '-b' option:\n")
+ print_incompat_helper(bincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-d' option:\n")
+ print_incompat_helper(dincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-t' option:\n")
+ print_incompat_helper(tincompat)
+
+ sys.stderr.write("Field definitions are as follows:\n")
+ for key in sorted(cols.keys()):
+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+def usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("\t -b : Print table of information for each dbuf\n")
+ sys.stderr.write("\t -d : Print table of information for each dnode\n")
+ sys.stderr.write("\t -h : Print this help message\n")
+ sys.stderr.write("\t -r : Print raw values\n")
+ sys.stderr.write("\t -t : Print table of information for each dnode type\n")
+ sys.stderr.write("\t -v : List all possible field headers and definitions"
+ "\n")
+ sys.stderr.write("\t -x : Print extended stats\n")
+ sys.stderr.write("\t -i : Redirect input from the specified file\n")
+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+ sys.stderr.write("\t -o : Redirect output to the specified file\n")
+ sys.stderr.write("\t -s : Override default field separator with custom "
+ "character or string\n")
+ sys.stderr.write("\nExamples:\n")
+ sys.stderr.write("\tdbufstat.py -d -o /tmp/d.log\n")
+ sys.stderr.write("\tdbufstat.py -t -s \",\" -o /tmp/t.log\n")
+ sys.stderr.write("\tdbufstat.py -v\n")
+ sys.stderr.write("\tdbufstat.py -d -f pool,object,objset,dsize,cached\n")
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+def prettynum(sz, scale, num=0):
+ global raw
+
+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+ index = 0
+ save = 0
+
+ if raw or scale == -1:
+ return "%*s" % (sz, num)
+
+ # Rounding error, return 0
+ elif num > 0 and num < 1:
+ num = 0
+
+ while num > scale and index < 5:
+ save = num
+ num = num / scale
+ index += 1
+
+ if index == 0:
+ return "%*d" % (sz, num)
+
+ if (save / scale) < 10:
+ return "%*.1f%s" % (sz - 1, num, suffix[index])
+ else:
+ return "%*d%s" % (sz - 1, num, suffix[index])
+
+def print_values(v):
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%s%s" % (
+ prettynum(cols[col][0], cols[col][1], v[col]), sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+def print_header():
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%*s%s" % (cols[col][0], col, sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+def get_typestring(t):
+ type_strings = ["DMU_OT_NONE",
+ # general:
+ "DMU_OT_OBJECT_DIRECTORY", "DMU_OT_OBJECT_ARRAY",
+ "DMU_OT_PACKED_NVLIST", "DMU_OT_PACKED_NVLIST_SIZE",
+ "DMU_OT_BPOBJ", "DMU_OT_BPOBJ_HDR",
+ # spa:
+ "DMU_OT_SPACE_MAP_HEADER", "DMU_OT_SPACE_MAP",
+ # zil:
+ "DMU_OT_INTENT_LOG",
+ # dmu:
+ "DMU_OT_DNODE", "DMU_OT_OBJSET",
+ # dsl:
+ "DMU_OT_DSL_DIR", "DMU_OT_DSL_DIR_CHILD_MAP",
+ "DMU_OT_DSL_DS_SNAP_MAP", "DMU_OT_DSL_PROPS",
+ "DMU_OT_DSL_DATASET",
+ # zpl:
+ "DMU_OT_ZNODE", "DMU_OT_OLDACL", "DMU_OT_PLAIN_FILE_CONTENTS",
+ "DMU_OT_DIRECTORY_CONTENTS", "DMU_OT_MASTER_NODE",
+ "DMU_OT_UNLINKED_SET",
+ # zvol:
+ "DMU_OT_ZVOL", "DMU_OT_ZVOL_PROP",
+ # other; for testing only!
+ "DMU_OT_PLAIN_OTHER", "DMU_OT_UINT64_OTHER", "DMU_OT_ZAP_OTHER",
+ # new object types:
+ "DMU_OT_ERROR_LOG", "DMU_OT_SPA_HISTORY",
+ "DMU_OT_SPA_HISTORY_OFFSETS", "DMU_OT_POOL_PROPS",
+ "DMU_OT_DSL_PERMS", "DMU_OT_ACL", "DMU_OT_SYSACL",
+ "DMU_OT_FUID", "DMU_OT_FUID_SIZE", "DMU_OT_NEXT_CLONES",
+ "DMU_OT_SCAN_QUEUE", "DMU_OT_USERGROUP_USED",
+ "DMU_OT_USERGROUP_QUOTA", "DMU_OT_USERREFS", "DMU_OT_DDT_ZAP",
+ "DMU_OT_DDT_STATS", "DMU_OT_SA", "DMU_OT_SA_MASTER_NODE",
+ "DMU_OT_SA_ATTR_REGISTRATION", "DMU_OT_SA_ATTR_LAYOUTS",
+ "DMU_OT_SCAN_XLATE", "DMU_OT_DEDUP", "DMU_OT_DEADLIST",
+ "DMU_OT_DEADLIST_HDR", "DMU_OT_DSL_CLONES",
+ "DMU_OT_BPOBJ_SUBOBJ"]
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % t
+
+ try:
+ return type_strings[t];
+ except IndexError:
+ return "%i" % t
+
+def get_compstring(c):
+ comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON",
+ "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB",
+ "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1",
+ "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3",
+ "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5",
+ "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7",
+ "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9",
+ "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4",
+ "ZIO_COMPRESS_FUNCTION"]
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % c
+
+ try:
+ return comp_strings[c];
+ except IndexError:
+ return "%i" % c
+
+def parse_line(line, labels):
+ global hdr
+
+ new = dict()
+ val = None
+ for col in hdr:
+ # These are "special" fields computed in the update_dict
+ # function, prevent KeyError exception on labels[col] for these.
+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']:
+ val = line[labels[col]]
+
+ if col in ['pool', 'flags']:
+ new[col] = str(val)
+ elif col in ['dtype', 'btype']:
+ new[col] = get_typestring(int(val))
+ elif col in ['l2_comp']:
+ new[col] = get_compstring(int(val))
+ else:
+ new[col] = int(val)
+
+ return new
+
+def update_dict(d, k, line, labels):
+ pool = line[labels['pool']]
+ objset = line[labels['objset']]
+ key = line[labels[k]]
+
+ dbsize = int(line[labels['dbsize']])
+ blkid = int(line[labels['blkid']])
+ level = int(line[labels['level']])
+
+ if pool not in d:
+ d[pool] = dict()
+
+ if objset not in d[pool]:
+ d[pool][objset] = dict()
+
+ if key not in d[pool][objset]:
+ d[pool][objset][key] = parse_line(line, labels)
+ d[pool][objset][key]['bonus'] = 0
+ d[pool][objset][key]['cached'] = 0
+ d[pool][objset][key]['direct'] = 0
+ d[pool][objset][key]['indirect'] = 0
+ d[pool][objset][key]['spill'] = 0
+
+ d[pool][objset][key]['cached'] += dbsize
+
+ if blkid == -1:
+ d[pool][objset][key]['bonus'] += dbsize
+ elif blkid == -2:
+ d[pool][objset][key]['spill'] += dbsize
+ else:
+ if level == 0:
+ d[pool][objset][key]['direct'] += dbsize
+ else:
+ d[pool][objset][key]['indirect'] += dbsize
+
+ return d
+
+def print_dict(d):
+ print_header()
+ for pool in d.iterkeys():
+ for objset in d[pool].iterkeys():
+ for v in d[pool][objset].itervalues():
+ print_values(v)
+
+def dnodes_build_dict(filehandle):
+ labels = dict()
+ dnodes = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(0, 2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(dnodes, 'object', line.split(), labels)
+
+ return dnodes
+
+def types_build_dict(filehandle):
+ labels = dict()
+ types = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(0, 2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(types, 'dtype', line.split(), labels)
+
+ return types
+
+def buffers_print_all(filehandle):
+ labels = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(0, 2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ print_header()
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ print_values(parse_line(line.split(), labels))
+
+def main():
+ global hdr
+ global sep
+ global raw
+
+ desired_cols = None
+ bflag = False
+ dflag = False
+ hflag = False
+ ifile = None
+ ofile = None
+ tflag = False
+ vflag = False
+ xflag = False
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "bdf:hi:o:rs:tvx",
+ [
+ "buffers",
+ "dnodes",
+ "columns",
+ "help",
+ "infile",
+ "outfile",
+ "seperator",
+ "types",
+ "verbose",
+ "extended"
+ ]
+ )
+ except getopt.error:
+ usage()
+
+ for opt, arg in opts:
+ if opt in ('-b', '--buffers'):
+ bflag = True
+ if opt in ('-d', '--dnodes'):
+ dflag = True
+ if opt in ('-f', '--columns'):
+ desired_cols = arg
+ if opt in ('-h', '--help'):
+ hflag = True
+ if opt in ('-i', '--infile'):
+ ifile = arg
+ if opt in ('-o', '--outfile'):
+ ofile = arg
+ if opt in ('-r', '--raw'):
+ raw += 1
+ if opt in ('-s', '--seperator'):
+ sep = arg
+ if opt in ('-t', '--types'):
+ tflag = True
+ if opt in ('-v', '--verbose'):
+ vflag = True
+ if opt in ('-x', '--extended'):
+ xflag = True
+
+ if hflag or (xflag and desired_cols):
+ usage()
+
+ if vflag:
+ detailed_usage()
+
+    # Ensure that at most one of the b, d, or t flags is set
+ if (bflag and dflag) or (bflag and tflag) or (dflag and tflag):
+ usage()
+
+ if bflag:
+ hdr = bxhdr if xflag else bhdr
+ elif tflag:
+ hdr = txhdr if xflag else thdr
+ else: # Even if dflag is False, it's the default if none set
+ dflag = True
+ hdr = dxhdr if xflag else dhdr
+
+ if desired_cols:
+ hdr = desired_cols.split(",")
+
+ invalid = []
+ incompat = []
+ for ele in hdr:
+ if ele not in cols:
+ invalid.append(ele)
+ elif ((bflag and bincompat and ele in bincompat) or
+ (dflag and dincompat and ele in dincompat) or
+ (tflag and tincompat and ele in tincompat)):
+ incompat.append(ele)
+
+ if len(invalid) > 0:
+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+ usage()
+
+ if len(incompat) > 0:
+ sys.stderr.write("Incompatible field specified! -- %s\n" % incompat)
+ usage()
+
+ if ofile:
+ try:
+ tmp = open(ofile, "w")
+ sys.stdout = tmp
+
+ except:
+            sys.stderr.write("Cannot open %s for writing\n" % ofile)
+ sys.exit(1)
+
+ if not ifile:
+ ifile = '/proc/spl/kstat/zfs/dbufs'
+
+    if ifile != "-":
+ try:
+ tmp = open(ifile, "r")
+ sys.stdin = tmp
+ except:
+ sys.stderr.write("Cannot open %s for reading\n" % ifile)
+ sys.exit(1)
+
+ if bflag:
+ buffers_print_all(sys.stdin)
+
+ if dflag:
+ print_dict(dnodes_build_dict(sys.stdin))
+
+ if tflag:
+ print_dict(types_build_dict(sys.stdin))
+
+if __name__ == '__main__':
+ main()
diff --git a/configure.ac b/configure.ac
index 08b807e3f..7c4809426 100644
--- a/configure.ac
+++ b/configure.ac
@@ -104,6 +104,7 @@ AC_CONFIG_FILES([
cmd/zvol_id/Makefile
cmd/vdev_id/Makefile
cmd/arcstat/Makefile
+ cmd/dbufstat/Makefile
module/Makefile
module/avl/Makefile
module/nvpair/Makefile
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 8c10d947c..efafb551a 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -92,6 +92,36 @@ typedef enum arc_space_type {
ARC_SPACE_NUMTYPES
} arc_space_type_t;
+typedef enum arc_state_type {
+ ARC_STATE_ANON,
+ ARC_STATE_MRU,
+ ARC_STATE_MRU_GHOST,
+ ARC_STATE_MFU,
+ ARC_STATE_MFU_GHOST,
+ ARC_STATE_L2C_ONLY,
+ ARC_STATE_NUMTYPES
+} arc_state_type_t;
+
+typedef struct arc_buf_info {
+ arc_state_type_t abi_state_type;
+ arc_buf_contents_t abi_state_contents;
+ uint64_t abi_state_index;
+ uint32_t abi_flags;
+ uint32_t abi_datacnt;
+ uint64_t abi_size;
+ uint64_t abi_spa;
+ uint64_t abi_access;
+ uint32_t abi_mru_hits;
+ uint32_t abi_mru_ghost_hits;
+ uint32_t abi_mfu_hits;
+ uint32_t abi_mfu_ghost_hits;
+ uint32_t abi_l2arc_hits;
+ uint32_t abi_holds;
+ uint64_t abi_l2arc_dattr;
+ uint64_t abi_l2arc_asize;
+ enum zio_compress abi_l2arc_compress;
+} arc_buf_info_t;
+
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
@@ -101,6 +131,7 @@ void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 8cd1fde01..85e967d0f 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -282,6 +282,9 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+void dbuf_stats_init(dbuf_hash_table_t *hash);
+void dbuf_stats_destroy(void);
+
#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index b0db7604d..fd1460d5c 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -660,6 +660,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
* If doi is NULL, just indicates whether the object exists.
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 51b588e6a..25abd9c03 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -71,13 +71,6 @@ typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
} zfs_all_blkstats_t;
-typedef struct txg_history {
- kstat_txg_t th_kstat;
- vdev_stat_t th_vs1;
- vdev_stat_t th_vs2;
- kmutex_t th_lock;
- list_node_t th_link;
-} txg_history_t;
typedef struct dsl_pool {
/* Immutable */
@@ -89,8 +82,6 @@ typedef struct dsl_pool {
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
- kstat_t *dp_txg_kstat;
- kstat_t *dp_tx_assign_kstat;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
@@ -111,11 +102,6 @@ typedef struct dsl_pool {
uint64_t dp_mos_used_delta;
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;
- uint64_t dp_txg_history_size;
- list_t dp_txg_history;
- uint64_t dp_tx_assign_size;
- kstat_named_t *dp_tx_assign_buckets;
-
/* Has its own locking */
tx_state_t dp_tx;
@@ -171,12 +157,6 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
void dsl_pool_rele(dsl_pool_t *dp, void *tag);
-void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs);
-
-txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg);
-txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_txg_history_put(txg_history_t *th);
-
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 401ae8343..cb3ce11bc 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -51,6 +51,8 @@ typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
+typedef struct zbookmark zbookmark_t;
+
struct dsl_pool;
struct dsl_dataset;
@@ -534,6 +536,41 @@ extern boolean_t spa_refcount_zero(spa_t *spa);
#define SCL_ALL ((1 << SCL_LOCKS) - 1)
#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
+/* Historical pool statistics */
+typedef struct spa_stats_history {
+ kmutex_t lock;
+ uint64_t count;
+ uint64_t size;
+ kstat_t *kstat;
+ void *private;
+ list_t list;
+} spa_stats_history_t;
+
+typedef struct spa_stats {
+ spa_stats_history_t read_history;
+ spa_stats_history_t txg_history;
+ spa_stats_history_t tx_assign_histogram;
+} spa_stats_t;
+
+typedef enum txg_state {
+ TXG_STATE_BIRTH = 0,
+ TXG_STATE_OPEN = 1,
+ TXG_STATE_QUIESCED = 2,
+ TXG_STATE_SYNCED = 3,
+ TXG_STATE_COMMITTED = 4,
+} txg_state_t;
+
+extern void spa_stats_init(spa_t *spa);
+extern void spa_stats_destroy(spa_t *spa);
+extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb,
+ uint32_t aflags);
+extern void spa_txg_history_add(spa_t *spa, uint64_t txg);
+extern int spa_txg_history_set(spa_t *spa, uint64_t txg,
+ txg_state_t completed_state, hrtime_t completed_time);
+extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved);
+extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
+
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 47dfe432e..107d14a76 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -236,6 +236,8 @@ struct spa {
uint64_t spa_deadman_calls; /* number of deadman calls */
uint64_t spa_sync_starttime; /* starting time fo spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
+ spa_stats_t spa_stats; /* assorted spa statistics */
+
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index a126c058e..dfd11d9f1 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -210,6 +210,7 @@ typedef struct kthread {
} kthread_t;
#define curthread zk_thread_current()
+#define getcomm() "unknown"
#define thread_exit zk_thread_exit
#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
zk_thread_create(stk, stksize, (thread_func_t)func, arg, \
@@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int,
char *, char *, uchar_t, ulong_t, uchar_t);
extern void kstat_install(kstat_t *);
extern void kstat_delete(kstat_t *);
+extern void kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index));
/*
* Kernel memory
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 189966bef..f5a128e0b 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES];
* Therefore it must not change size or alignment between 32/64 bit
* compilation options.
*/
-typedef struct zbookmark {
+struct zbookmark {
uint64_t zb_objset;
uint64_t zb_object;
int64_t zb_level;
uint64_t zb_blkid;
-} zbookmark_t;
+ char * zb_func;
+};
#define SET_BOOKMARK(zb, objset, object, level, blkid) \
{ \
@@ -269,6 +270,7 @@ typedef struct zbookmark {
(zb)->zb_object = object; \
(zb)->zb_level = level; \
(zb)->zb_blkid = blkid; \
+ (zb)->zb_func = FTAG; \
}
#define ZB_DESTROYED_OBJSET (-1ULL)
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
index 6bd2ec878..fcd3ed98b 100644
--- a/lib/libspl/include/sys/kstat.h
+++ b/lib/libspl/include/sys/kstat.h
@@ -228,10 +228,8 @@ typedef struct kstat32 {
/* ks_ndata == 1 */
#define KSTAT_TYPE_TIMER 4 /* event timer */
/* ks_ndata >= 1 */
-#define KSTAT_TYPE_TXG 5 /* txg statistics */
- /* ks_ndata >= 0 */
-#define KSTAT_NUM_TYPES 6
+#define KSTAT_NUM_TYPES 5
/*
* kstat class
@@ -700,29 +698,6 @@ typedef struct kstat_timer {
#define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data)
-/*
- * TXG statistics - bytes read/written and iops performed
- */
-typedef enum kstat_txg_state {
- TXG_STATE_OPEN = 1,
- TXG_STATE_QUIESCING = 2,
- TXG_STATE_SYNCING = 3,
- TXG_STATE_COMMITTED = 4,
-} kstat_txg_state_t;
-
-typedef struct kstat_txg {
- u_longlong_t txg; /* txg id */
- kstat_txg_state_t state; /* txg state */
- hrtime_t birth; /* birth time stamp */
- u_longlong_t nread; /* number of bytes read */
- u_longlong_t nwritten; /* number of bytes written */
- uint_t reads; /* number of read operations */
- uint_t writes; /* number of write operations */
- hrtime_t open_time; /* open time */
- hrtime_t quiesce_time; /* quiesce time */
- hrtime_t sync_time; /* sync time */
-} kstat_txg_t;
-
#if defined(_KERNEL)
#include <sys/t_lock.h>
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index da81659d2..e4189a378 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -25,6 +25,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/bpobj.c \
$(top_srcdir)/module/zfs/bptree.c \
$(top_srcdir)/module/zfs/dbuf.c \
+ $(top_srcdir)/module/zfs/dbuf_stats.c \
$(top_srcdir)/module/zfs/ddt.c \
$(top_srcdir)/module/zfs/ddt_zap.c \
$(top_srcdir)/module/zfs/dmu.c \
@@ -62,6 +63,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/spa_errlog.c \
$(top_srcdir)/module/zfs/spa_history.c \
$(top_srcdir)/module/zfs/spa_misc.c \
+ $(top_srcdir)/module/zfs/spa_stats.c \
$(top_srcdir)/module/zfs/space_map.c \
$(top_srcdir)/module/zfs/txg.c \
$(top_srcdir)/module/zfs/uberblock.c \
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index e4d645cf7..57a3739e2 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -240,6 +240,14 @@ void
kstat_delete(kstat_t *ksp)
{}
+/*ARGSUSED*/
+void
+kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{}
+
/*
* =========================================================================
* mutexes
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index e71228454..5552436ad 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
$(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
$(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
@@ -44,6 +45,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 1298c5b91..9098988fd 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -232,6 +232,7 @@ typedef struct arc_state {
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
+ arc_state_type_t arcs_state;
} arc_state_t;
/* The 6 states: */
@@ -534,6 +535,11 @@ struct arc_buf_hdr {
/* updated atomically */
clock_t b_arc_access;
+ uint32_t b_mru_hits;
+ uint32_t b_mru_ghost_hits;
+ uint32_t b_mfu_hits;
+ uint32_t b_mfu_ghost_hits;
+ uint32_t b_l2_hits;
/* self protecting */
refcount_t b_refcnt;
@@ -709,7 +715,8 @@ struct l2arc_buf_hdr {
/* compression applied to buffer data */
enum zio_compress b_compress;
/* real alloc'd buffer size depending on b_compress applied */
- int b_asize;
+ uint32_t b_asize;
+ uint32_t b_hits;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
};
@@ -1138,6 +1145,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
}
/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set, the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal,
+ * callers are strongly encouraged not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ arc_state_t *state = hdr->b_state;
+
+ memset(abi, 0, sizeof(arc_buf_info_t));
+ abi->abi_flags = hdr->b_flags;
+ abi->abi_datacnt = hdr->b_datacnt;
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = hdr->b_type;
+ abi->abi_state_index = -1;
+ abi->abi_size = hdr->b_size;
+ abi->abi_access = hdr->b_arc_access;
+ abi->abi_mru_hits = hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
+ abi->abi_holds = refcount_count(&hdr->b_refcnt);
+
+ if (hdr->b_l2hdr) {
+ abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
+ abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
+ abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
+ abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
+ }
+
+ if (state && state_index && list_link_active(&hdr->b_arc_node)) {
+ list_t *list = &state->arcs_list[hdr->b_type];
+ arc_buf_hdr_t *h;
+
+ mutex_enter(&state->arcs_mtx);
+ for (h = list_head(list); h != NULL; h = list_next(list, h)) {
+ abi->abi_state_index++;
+ if (h == hdr)
+ break;
+ }
+ mutex_exit(&state->arcs_mtx);
+ }
+}
+
+/*
* Move the supplied buffer to the indicated state. The mutex
* for the buffer must be held by the caller.
*/
@@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(list_link_active(&buf->b_arc_node));
} else {
buf->b_flags &= ~ARC_PREFETCH;
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
}
buf->b_arc_access = now;
@@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (buf->b_state == arc_mru_ghost) {
arc_state_t *new_state;
@@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mru_ghost_hits);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (buf->b_state == arc_mfu) {
/*
@@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
}
+ atomic_inc_32(&buf->b_mfu_hits);
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
@@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mfu_ghost_hits);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (buf->b_state == arc_l2c_only) {
/*
@@ -2943,6 +3008,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
kmutex_t *hash_lock;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ int rc = 0;
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -2976,10 +3042,10 @@ top:
hdr->b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -3133,6 +3199,7 @@ top:
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr->b_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_PUSHPAGE);
@@ -3174,12 +3241,12 @@ top:
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
- return (0);
+ goto out;
}
ASSERT(*arc_flags & ARC_WAIT);
if (zio_wait(rzio) == 0)
- return (0);
+ goto out;
/* l2arc read error; goto zio_read() */
} else {
@@ -3203,13 +3270,18 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
+ if (*arc_flags & ARC_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
- return (0);
+
+out:
+ spa_read_history_add(spa, zb, *arc_flags);
+ return (rc);
}
arc_prune_t *
@@ -3463,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
+ nhdr->b_mru_hits = 0;
+ nhdr->b_mru_ghost_hits = 0;
+ nhdr->b_mfu_hits = 0;
+ nhdr->b_mfu_ghost_hits = 0;
+ nhdr->b_l2_hits = 0;
nhdr->b_flags = flags & ARC_L2_WRITING;
nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
@@ -3479,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag)
if (hdr->b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
if (hash_lock)
mutex_exit(hash_lock);
@@ -3896,6 +3978,13 @@ arc_init(void)
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+
buf_init();
arc_thread_exit = 0;
@@ -4779,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
l2hdr->b_compress = ZIO_COMPRESS_OFF;
l2hdr->b_asize = ab->b_size;
l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+ l2hdr->b_hits = 0;
buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
@@ -5311,6 +5401,7 @@ l2arc_stop(void)
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d655d6621..44e9419b7 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -317,6 +317,8 @@ retry:
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+ dbuf_stats_init(h);
}
void
@@ -325,6 +327,8 @@ dbuf_fini(void)
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
+ dbuf_stats_destroy();
+
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
#if defined(_KERNEL) && defined(HAVE_SPL)
diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c
new file mode 100644
index 000000000..ef760eaba
--- /dev/null
+++ b/module/zfs/dbuf_stats.c
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * Calculate the index of the arc header for the state, disabled by default.
+ */
+int zfs_dbuf_state_index = 0;
+
+/*
+ * ==========================================================================
+ * Dbuf Hash Read Routines
+ * ==========================================================================
+ */
+typedef struct dbuf_stats_t {
+ kmutex_t lock;
+ kstat_t *kstat;
+ dbuf_hash_table_t *hash;
+ int idx;
+} dbuf_stats_t;
+
+static dbuf_stats_t dbuf_stats_hash_table;
+
+static int
+dbuf_stats_hash_table_headers(char *buf, size_t size)
+{
+ size = snprintf(buf, size - 1,
+ "%-88s | %-124s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
+ "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
+ "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
+ "atype", "index", "flags", "count", "asize", "access", "mru", "gmru",
+ "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds",
+ "dtype", "btype", "data_bs", "meta_bs", "bsize",
+ "lvls", "dholds", "blocks", "dsize");
+ buf[size] = '\0';
+
+ return (0);
+}
+
+int
+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
+{
+ arc_buf_info_t abi = { 0 };
+ dmu_object_info_t doi = { 0 };
+ dnode_t *dn = DB_DNODE(db);
+
+ if (db->db_buf)
+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
+
+ if (dn)
+ __dmu_object_info_from_dnode(dn, &doi);
+
+ size = snprintf(buf, size - 1,
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
+ "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
+ /* dmu_buf_impl_t */
+ spa_name(dn->dn_objset->os_spa),
+ (u_longlong_t)dmu_objset_id(db->db_objset),
+ (longlong_t)db->db.db_object,
+ (longlong_t)db->db_level,
+ (longlong_t)db->db_blkid,
+ (u_longlong_t)db->db.db_offset,
+ (u_longlong_t)db->db.db_size,
+ !!dbuf_is_metadata(db),
+ db->db_state,
+ (ulong_t)refcount_count(&db->db_holds),
+ /* arc_buf_info_t */
+ abi.abi_state_type,
+ abi.abi_state_contents,
+ (longlong_t)abi.abi_state_index,
+ abi.abi_flags,
+ (ulong_t)abi.abi_datacnt,
+ (u_longlong_t)abi.abi_size,
+ (u_longlong_t)abi.abi_access,
+ (ulong_t)abi.abi_mru_hits,
+ (ulong_t)abi.abi_mru_ghost_hits,
+ (ulong_t)abi.abi_mfu_hits,
+ (ulong_t)abi.abi_mfu_ghost_hits,
+ (ulong_t)abi.abi_l2arc_hits,
+ (u_longlong_t)abi.abi_l2arc_dattr,
+ (u_longlong_t)abi.abi_l2arc_asize,
+ abi.abi_l2arc_compress,
+ (ulong_t)abi.abi_holds,
+ /* dmu_object_info_t */
+ doi.doi_type,
+ doi.doi_bonus_type,
+ (ulong_t)doi.doi_data_block_size,
+ (ulong_t)doi.doi_metadata_block_size,
+ (u_longlong_t)doi.doi_bonus_size,
+ (ulong_t)doi.doi_indirection,
+ (ulong_t)refcount_count(&dn->dn_holds),
+ (u_longlong_t)doi.doi_fill_count,
+ (u_longlong_t)doi.doi_max_offset);
+ buf[size] = '\0';
+
+ return (size);
+}
+
+static int
+dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
+{
+ dbuf_stats_t *dsh = (dbuf_stats_t *)data;
+ dbuf_hash_table_t *h = dsh->hash;
+ dmu_buf_impl_t *db;
+ int length, error = 0;
+
+ ASSERT3S(dsh->idx, >=, 0);
+ ASSERT3S(dsh->idx, <=, h->hash_table_mask);
+ memset(buf, 0, size);
+
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
+ /*
+ * Returning ENOMEM will cause the data and header functions
+		 * to be called with a larger scratch buffer.
+ */
+ if (size < 512) {
+ error = ENOMEM;
+ break;
+ }
+
+ mutex_enter(&db->db_mtx);
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ length = __dbuf_stats_hash_table_data(buf, size, db);
+ buf += length;
+ size -= length;
+
+ mutex_exit(&db->db_mtx);
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ return (error);
+}
+
+static void *
+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
+{
+ dbuf_stats_t *dsh = ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&dsh->lock));
+
+ if (n <= dsh->hash->hash_table_mask) {
+ dsh->idx = n;
+ return (dsh);
+ }
+
+ return (NULL);
+}
+
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
+ dsh->hash = hash;
+
+ ksp = kstat_create("zfs", 0, "dbufs", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ dsh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &dsh->lock;
+ ksp->ks_ndata = UINT32_MAX;
+ ksp->ks_private = dsh;
+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ ksp = dsh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&dsh->lock);
+}
+
+void
+dbuf_stats_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_hash_table_init(hash);
+}
+
+void
+dbuf_stats_destroy(void)
+{
+ dbuf_stats_hash_table_destroy();
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_dbuf_state_index, int, 0644);
+MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index");
+#endif
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index cbf4790b1..4ec9cb46a 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
}
void
-dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
- dnode_phys_t *dnp;
+ dnode_phys_t *dnp = dn->dn_phys;
int i;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- mutex_enter(&dn->dn_mtx);
-
- dnp = dn->dn_phys;
-
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
@@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_fill_count = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ __dmu_object_info_from_dnode(dn, doi);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 32b3e50fc..1827ffc74 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
+ SET_BOOKMARK(czb, td->td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
uint32_t flags = ARC_WAIT;
@@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
err = arc_read(NULL, td->td_spa, rootbp,
arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
if (err != 0)
return (err);
@@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td, TQ_NOQUEUE))
pd->pd_exited = B_TRUE;
- SET_BOOKMARK(czb, td->td_objset,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(td, NULL, rootbp, czb);
mutex_enter(&pd->pd_mtx);
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 3e46a02f8..7b6223956 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1077,7 +1077,7 @@ dmu_tx_unassign(dmu_tx_t *tx)
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
- hrtime_t before, after;
+ hrtime_t before;
int err;
ASSERT(tx->tx_txg == 0);
@@ -1100,10 +1100,7 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
txg_rele_to_quiesce(&tx->tx_txgh);
- after = gethrtime();
-
- dsl_pool_tx_assign_add_usecs(tx->tx_pool,
- (after - before) / NSEC_PER_USEC);
+ spa_tx_assign_add_nsecs(tx->tx_pool->dp_spa, gethrtime() - before);
return (0);
}
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index da4a9df85..ffef9be75 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -48,7 +48,6 @@
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
-int zfs_txg_history = 60; /* statistics for the last N txgs */
unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
unsigned long zfs_write_limit_max = 0; /* max data payload per txg */
@@ -59,200 +58,6 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
-static void
-dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata)
-{
- kstat_named_t *ks;
- char name[KSTAT_STRLEN];
- int i, data_size = ndata * sizeof(kstat_named_t);
-
- (void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s",
- spa_name(dp->dp_spa));
-
- dp->dp_tx_assign_size = ndata;
-
- if (data_size)
- dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP);
- else
- dp->dp_tx_assign_buckets = NULL;
-
- for (i = 0; i < dp->dp_tx_assign_size; i++) {
- ks = &dp->dp_tx_assign_buckets[i];
- ks->data_type = KSTAT_DATA_UINT64;
- ks->value.ui64 = 0;
- (void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i);
- }
-
- dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc",
- KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
-
- if (dp->dp_tx_assign_kstat) {
- dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets;
- dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size;
- dp->dp_tx_assign_kstat->ks_data_size = data_size;
- kstat_install(dp->dp_tx_assign_kstat);
- }
-}
-
-static void
-dsl_pool_tx_assign_destroy(dsl_pool_t *dp)
-{
- if (dp->dp_tx_assign_buckets)
- kmem_free(dp->dp_tx_assign_buckets,
- dp->dp_tx_assign_size * sizeof(kstat_named_t));
-
- if (dp->dp_tx_assign_kstat)
- kstat_delete(dp->dp_tx_assign_kstat);
-}
-
-void
-dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs)
-{
- uint64_t idx = 0;
-
- while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1))
- idx++;
-
- atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64);
-}
-
-static int
-dsl_pool_txg_history_update(kstat_t *ksp, int rw)
-{
- dsl_pool_t *dp = ksp->ks_private;
- txg_history_t *th;
- int i = 0;
-
- if (rw == KSTAT_WRITE)
- return (EACCES);
-
- if (ksp->ks_data)
- kmem_free(ksp->ks_data, ksp->ks_data_size);
-
- mutex_enter(&dp->dp_lock);
-
- ksp->ks_ndata = dp->dp_txg_history_size;
- ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t);
- if (ksp->ks_data_size > 0)
- ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE);
-
- /* Traversed oldest to youngest for the most readable kstat output */
- for (th = list_tail(&dp->dp_txg_history); th != NULL;
- th = list_prev(&dp->dp_txg_history, th)) {
- mutex_enter(&th->th_lock);
- ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size);
- memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t));
- i += sizeof(kstat_txg_t);
- mutex_exit(&th->th_lock);
- }
-
- mutex_exit(&dp->dp_lock);
-
- return (0);
-}
-
-static void
-dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg)
-{
- char name[KSTAT_STRLEN];
-
- list_create(&dp->dp_txg_history, sizeof (txg_history_t),
- offsetof(txg_history_t, th_link));
- dsl_pool_txg_history_add(dp, txg);
-
- (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa));
- dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc",
- KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL);
- if (dp->dp_txg_kstat) {
- dp->dp_txg_kstat->ks_data = NULL;
- dp->dp_txg_kstat->ks_private = dp;
- dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update;
- kstat_install(dp->dp_txg_kstat);
- }
-}
-
-static void
-dsl_pool_txg_history_destroy(dsl_pool_t *dp)
-{
- txg_history_t *th;
-
- if (dp->dp_txg_kstat) {
- if (dp->dp_txg_kstat->ks_data)
- kmem_free(dp->dp_txg_kstat->ks_data,
- dp->dp_txg_kstat->ks_data_size);
-
- kstat_delete(dp->dp_txg_kstat);
- }
-
- mutex_enter(&dp->dp_lock);
- while ((th = list_remove_head(&dp->dp_txg_history))) {
- dp->dp_txg_history_size--;
- mutex_destroy(&th->th_lock);
- kmem_free(th, sizeof(txg_history_t));
- }
-
- ASSERT3U(dp->dp_txg_history_size, ==, 0);
- list_destroy(&dp->dp_txg_history);
- mutex_exit(&dp->dp_lock);
-}
-
-txg_history_t *
-dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
-{
- txg_history_t *th, *rm;
-
- th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE);
- mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
- th->th_kstat.txg = txg;
- th->th_kstat.state = TXG_STATE_OPEN;
- th->th_kstat.birth = gethrtime();
-
- mutex_enter(&dp->dp_lock);
-
- list_insert_head(&dp->dp_txg_history, th);
- dp->dp_txg_history_size++;
-
- while (dp->dp_txg_history_size > zfs_txg_history) {
- dp->dp_txg_history_size--;
- rm = list_remove_tail(&dp->dp_txg_history);
- mutex_destroy(&rm->th_lock);
- kmem_free(rm, sizeof(txg_history_t));
- }
-
- mutex_exit(&dp->dp_lock);
-
- return (th);
-}
-
-/*
- * Traversed youngest to oldest because lookups are only done for open
- * or syncing txgs which are guaranteed to be at the head of the list.
- * The txg_history_t structure will be returned locked.
- */
-txg_history_t *
-dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
-{
- txg_history_t *th;
-
- mutex_enter(&dp->dp_lock);
- for (th = list_head(&dp->dp_txg_history); th != NULL;
- th = list_next(&dp->dp_txg_history, th)) {
- if (th->th_kstat.txg == txg) {
- mutex_enter(&th->th_lock);
- break;
- }
- }
- mutex_exit(&dp->dp_lock);
-
- return (th);
-}
-
-void
-dsl_pool_txg_history_put(txg_history_t *th)
-{
- mutex_exit(&th->th_lock);
-}
-
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@@ -295,9 +100,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
1, 4, 0);
- dsl_pool_txg_history_init(dp, txg);
- dsl_pool_tx_assign_init(dp, 32);
-
return (dp);
}
@@ -438,8 +240,6 @@ dsl_pool_close(dsl_pool_t *dp)
arc_flush(dp->dp_spa);
txg_fini(dp);
dsl_scan_fini(dp);
- dsl_pool_tx_assign_destroy(dp);
- dsl_pool_txg_history_destroy(dp);
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
taskq_destroy(dp->dp_iput_taskq);
@@ -1241,9 +1041,6 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
module_param(zfs_txg_synctime_ms, int, 0644);
MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");
-module_param(zfs_txg_history, int, 0644);
-MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
-
module_param(zfs_write_limit_min, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 9ba6e8963..a0b4797c9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -48,6 +48,7 @@
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/kstat.h>
#include "zfs_prop.h"
#include "zfeature_common.h"
@@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL;
*/
int zfs_deadman_enabled = 1;
-
/*
* ==========================================================================
* SPA config locking
@@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
+ spa_stats_init(spa);
avl_add(&spa_namespace_avl, spa);
@@ -580,6 +581,7 @@ spa_remove(spa_t *spa)
refcount_destroy(&spa->spa_refcount);
+ spa_stats_destroy(spa);
spa_config_lock_destroy(spa);
for (t = 0; t < TXG_SIZE; t++)
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
new file mode 100644
index 000000000..789e8c3e6
--- /dev/null
+++ b/module/zfs/spa_stats.c
@@ -0,0 +1,636 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+
+/*
+ * Keep stats on the last N reads per spa_t; disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * Keeps stats on the last N txgs, disabled by default.
+ */
+int zfs_txg_history = 0;
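+
+/*
+ * Each of these tunables is exported as a writable module parameter, so
+ * the history depth can normally be adjusted at runtime, e.g. (assuming
+ * the usual Linux module parameter layout):
+ *
+ *   echo 100 > /sys/module/zfs/parameters/zfs_txg_history
+ */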
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+ uint64_t uid; /* unique identifier */
+	hrtime_t	start;		/* time the read was issued */
+ uint64_t objset; /* read from this objset */
+ uint64_t object; /* read of this object number */
+ uint64_t level; /* block's indirection level */
+ uint64_t blkid; /* read of this block id */
+ char origin[24]; /* read originated from here */
+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
+ pid_t pid; /* PID of task doing read */
+ char comm[16]; /* process name of task doing read */
+ list_node_t srh_link;
+} spa_read_history_t;
+
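+/*
+ * Raw kstat header callback: write the fixed column header line for the
+ * per-pool "reads" kstat into the caller-supplied buffer.
+ */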
+static int
+spa_read_history_headers(char *buf, size_t size)
+{
+ size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+ "level", "blkid", "aflags", "origin", "pid", "process");
+ buf[size] = '\0';
+
+ return (0);
+}
+
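+/*
+ * Raw kstat data callback: format a single spa_read_history_t entry as
+ * one row matching the header above.
+ */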
+static int
+spa_read_history_data(char *buf, size_t size, void *data)
+{
+ spa_read_history_t *srh = (spa_read_history_t *)data;
+
+ size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx "
+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+ (u_longlong_t)srh->uid, srh->start,
+ (longlong_t)srh->objset, (longlong_t)srh->object,
+ (longlong_t)srh->level, (longlong_t)srh->blkid,
+ srh->aflags, srh->origin, srh->pid, srh->comm);
+ buf[size] = '\0';
+
+ return (0);
+}
+
+/*
+ * Calculate the address of the next spa_read_history_t entry.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static void *
+spa_read_history_addr(kstat_t *ksp, loff_t n)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+ ASSERT(MUTEX_HELD(&ssh->lock));
+
+ if (n == 0)
+ ssh->private = list_tail(&ssh->list);
+ else if (ssh->private)
+ ssh->private = list_prev(&ssh->list, ssh->private);
+
+ return (ssh->private);
+}
+
+/*
+ * When the kstat is written, discard all spa_read_history_t entries.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static int
+spa_read_history_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+ if (rw == KSTAT_WRITE) {
+ spa_read_history_t *srh;
+
+ while ((srh = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(srh, sizeof(spa_read_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ }
+
+ ksp->ks_ndata = ssh->size;
+ ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t);
+
+ return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ char name[KSTAT_STRLEN];
+ kstat_t *ksp;
+
+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&ssh->list, sizeof (spa_read_history_t),
+ offsetof(spa_read_history_t, srh_link));
+
+ ssh->count = 0;
+ ssh->size = 0;
+ ssh->private = NULL;
+
+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+ name[KSTAT_STRLEN-1] = '\0';
+
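+	/*
+	 * Register a virtual raw kstat; no data buffer is preallocated,
+	 * entries are walked via the addr/data callbacks above.  With the
+	 * SPL this is expected to appear as /proc/spl/kstat/zfs/<pool>/reads.
+	 */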
+ ksp = kstat_create(name, 0, "reads", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ ssh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &ssh->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_read_history_update;
+ kstat_set_raw_ops(ksp, spa_read_history_headers,
+ spa_read_history_data, spa_read_history_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ spa_read_history_t *srh;
+ kstat_t *ksp;
+
+ ksp = ssh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_enter(&ssh->lock);
+ while ((srh = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(srh, sizeof(spa_read_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ list_destroy(&ssh->list);
+ mutex_exit(&ssh->lock);
+
+ mutex_destroy(&ssh->lock);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+ spa_read_history_t *srh, *rm;
+
+ ASSERT3P(spa, !=, NULL);
+ ASSERT3P(zb, !=, NULL);
+
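+	/*
+	 * Nothing to record when read history is disabled and there are no
+	 * stale entries left to prune; optionally skip ARC cache hits too.
+	 */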
+ if (zfs_read_history == 0 && ssh->size == 0)
+ return;
+
+ if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
+ return;
+
+ srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE);
+ strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin));
+ strlcpy(srh->comm, getcomm(), sizeof(srh->comm));
+ srh->start = gethrtime();
+ srh->objset = zb->zb_objset;
+ srh->object = zb->zb_object;
+ srh->level = zb->zb_level;
+ srh->blkid = zb->zb_blkid;
+ srh->aflags = aflags;
+ srh->pid = getpid();
+
+ mutex_enter(&ssh->lock);
+
+ srh->uid = ssh->count++;
+ list_insert_head(&ssh->list, srh);
+ ssh->size++;
+
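+	/* Discard the oldest entries until we are within zfs_read_history. */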
+ while (ssh->size > zfs_read_history) {
+ ssh->size--;
+ rm = list_remove_tail(&ssh->list);
+ kmem_free(rm, sizeof(spa_read_history_t));
+ }
+
+ mutex_exit(&ssh->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+ uint64_t txg; /* txg id */
+ txg_state_t state; /* active txg state */
+ uint64_t nread; /* number of bytes read */
+ uint64_t nwritten; /* number of bytes written */
+ uint64_t reads; /* number of read operations */
+ uint64_t writes; /* number of write operations */
+ uint64_t nreserved; /* number of bytes reserved */
+ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
+ list_node_t sth_link;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_headers(char *buf, size_t size)
+{
+ size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s "
+ "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state",
+ "nreserved", "nread", "nwritten", "reads", "writes",
+ "otime", "qtime", "stime");
+ buf[size] = '\0';
+
+ return (0);
+}
+
+static int
+spa_txg_history_data(char *buf, size_t size, void *data)
+{
+ spa_txg_history_t *sth = (spa_txg_history_t *)data;
+ uint64_t open = 0, quiesce = 0, sync = 0;
+ char state;
+
+ switch (sth->state) {
+ case TXG_STATE_BIRTH: state = 'B'; break;
+ case TXG_STATE_OPEN: state = 'O'; break;
+ case TXG_STATE_QUIESCED: state = 'Q'; break;
+ case TXG_STATE_SYNCED: state = 'S'; break;
+ case TXG_STATE_COMMITTED: state = 'C'; break;
+ default: state = '?'; break;
+ }
+
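+	/* Derive per-state latencies from the recorded completion times. */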
+ if (sth->times[TXG_STATE_OPEN])
+ open = sth->times[TXG_STATE_OPEN] -
+ sth->times[TXG_STATE_BIRTH];
+
+ if (sth->times[TXG_STATE_QUIESCED])
+ quiesce = sth->times[TXG_STATE_QUIESCED] -
+ sth->times[TXG_STATE_OPEN];
+
+ if (sth->times[TXG_STATE_SYNCED])
+ sync = sth->times[TXG_STATE_SYNCED] -
+ sth->times[TXG_STATE_QUIESCED];
+
+ size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu "
+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n",
+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+ (u_longlong_t)sth->nreserved,
+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync);
+ buf[size] = '\0';
+
+ return (0);
+}
+
+/*
+ * Calculate the address of the next spa_txg_history_t entry.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static void *
+spa_txg_history_addr(kstat_t *ksp, loff_t n)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+
+ ASSERT(MUTEX_HELD(&ssh->lock));
+
+ if (n == 0)
+ ssh->private = list_tail(&ssh->list);
+ else if (ssh->private)
+ ssh->private = list_prev(&ssh->list, ssh->private);
+
+ return (ssh->private);
+}
+
+/*
+ * When the kstat is written, discard all spa_txg_history_t entries.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static int
+spa_txg_history_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+
+ ASSERT(MUTEX_HELD(&ssh->lock));
+
+ if (rw == KSTAT_WRITE) {
+ spa_txg_history_t *sth;
+
+ while ((sth = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(sth, sizeof(spa_txg_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ }
+
+ ksp->ks_ndata = ssh->size;
+ ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t);
+
+ return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+ char name[KSTAT_STRLEN];
+ kstat_t *ksp;
+
+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&ssh->list, sizeof (spa_txg_history_t),
+ offsetof(spa_txg_history_t, sth_link));
+
+ ssh->count = 0;
+ ssh->size = 0;
+ ssh->private = NULL;
+
+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+ name[KSTAT_STRLEN-1] = '\0';
+
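+	/*
+	 * Register the per-pool "txgs" raw kstat; with the SPL this is
+	 * expected to appear as /proc/spl/kstat/zfs/<pool>/txgs.
+	 */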
+ ksp = kstat_create(name, 0, "txgs", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ ssh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &ssh->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_txg_history_update;
+ kstat_set_raw_ops(ksp, spa_txg_history_headers,
+ spa_txg_history_data, spa_txg_history_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ kstat_t *ksp;
+
+ ksp = ssh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_enter(&ssh->lock);
+ while ((sth = list_remove_head(&ssh->list))) {
+ ssh->size--;
+ kmem_free(sth, sizeof(spa_txg_history_t));
+ }
+
+ ASSERT3U(ssh->size, ==, 0);
+ list_destroy(&ssh->list);
+ mutex_exit(&ssh->lock);
+
+ mutex_destroy(&ssh->lock);
+}
+
+/*
+ * Add a new txg to the historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth, *rm;
+
+ if (zfs_txg_history == 0 && ssh->size == 0)
+ return;
+
+ sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE);
+ sth->txg = txg;
+ sth->state = TXG_STATE_OPEN;
+ sth->times[TXG_STATE_BIRTH] = gethrtime();
+
+ mutex_enter(&ssh->lock);
+
+ list_insert_head(&ssh->list, sth);
+ ssh->size++;
+
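+	/* Keep at most zfs_txg_history entries, discarding the oldest. */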
+ while (ssh->size > zfs_txg_history) {
+ ssh->size--;
+ rm = list_remove_tail(&ssh->list);
+ kmem_free(rm, sizeof(spa_txg_history_t));
+ }
+
+ mutex_exit(&ssh->lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+ hrtime_t completed_time)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&ssh->lock);
+ for (sth = list_head(&ssh->list); sth != NULL;
+ sth = list_next(&ssh->list, sth)) {
+ if (sth->txg == txg) {
+ sth->times[completed_state] = completed_time;
+ sth->state++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&ssh->lock);
+
+ return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&ssh->lock);
+ for (sth = list_head(&ssh->list); sth != NULL;
+ sth = list_next(&ssh->list, sth)) {
+ if (sth->txg == txg) {
+ sth->nread = nread;
+ sth->nwritten = nwritten;
+ sth->reads = reads;
+ sth->writes = writes;
+ sth->nreserved = nreserved;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&ssh->lock);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written, zero all buckets.  When the kstat is read,
+ * count the trailing buckets that are still zero and update ks_ndata so
+ * that they are not output.
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+ int i;
+
+ if (rw == KSTAT_WRITE) {
+ for (i = 0; i < ssh->count; i++)
+ ((kstat_named_t *)ssh->private)[i].value.ui64 = 0;
+ }
+
+ for (i = ssh->count; i > 0; i--)
+ if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0)
+ break;
+
+ ksp->ks_ndata = i;
+ ksp->ks_data_size = i * sizeof(kstat_named_t);
+
+ return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+ char name[KSTAT_STRLEN];
+ kstat_named_t *ks;
+ kstat_t *ksp;
+ int i;
+
+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ssh->count = 42; /* power of two buckets for 1ns to 2,199s */
+ ssh->size = ssh->count * sizeof(kstat_named_t);
+ ssh->private = kmem_alloc(ssh->size, KM_SLEEP);
+
+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+ name[KSTAT_STRLEN-1] = '\0';
+
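+	/* Initialize the buckets; each is named after its power-of-two boundary in ns. */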
+ for (i = 0; i < ssh->count; i++) {
+ ks = &((kstat_named_t *)ssh->private)[i];
+ ks->data_type = KSTAT_DATA_UINT64;
+ ks->value.ui64 = 0;
+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+ (u_longlong_t)1 << i);
+ }
+
+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+ ssh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &ssh->lock;
+ ksp->ks_data = ssh->private;
+ ksp->ks_ndata = ssh->count;
+ ksp->ks_data_size = ssh->size;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_tx_assign_update;
+ kstat_install(ksp);
+ }
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+ kstat_t *ksp;
+
+ ksp = ssh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ kmem_free(ssh->private, ssh->size);
+ mutex_destroy(&ssh->lock);
+}
+
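+/*
+ * Record a dmu_tx_assign() wait time by bumping the histogram bucket
+ * whose power-of-two boundary first covers the given number of nanoseconds.
+ */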
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+ uint64_t idx = 0;
+
+	while (((1ULL << idx) < nsecs) && (idx < ssh->count - 1))
+ idx++;
+
+ atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64);
+}
+
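+/* Create all per-pool kstats; called from spa_add(). */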
+void
+spa_stats_init(spa_t *spa)
+{
+ spa_read_history_init(spa);
+ spa_txg_history_init(spa);
+ spa_tx_assign_init(spa);
+}
+
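+/* Tear down the per-pool kstats in reverse order; called from spa_remove(). */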
+void
+spa_stats_destroy(spa_t *spa)
+{
+ spa_tx_assign_destroy(spa);
+ spa_txg_history_destroy(spa);
+ spa_read_history_destroy(spa);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_read_history, int, 0644);
+MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
+
+module_param(zfs_read_history_hits, int, 0644);
+MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
+
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
+#endif
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index b3e537f45..7a3da8647 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -27,11 +27,11 @@
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
+#include <sys/spa_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
-#include <sys/spa_impl.h>
/*
* ZFS Transaction Groups
@@ -351,8 +351,6 @@ txg_rele_to_sync(txg_handle_t *th)
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
- hrtime_t start;
- txg_history_t *th;
tx_state_t *tx = &dp->dp_tx;
int g = txg & TXG_MASK;
int c;
@@ -366,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
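+	/* Note when this txg left the open state and begin tracking the new one. */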
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
+ spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
+
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
@@ -374,19 +375,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
mutex_exit(&tx->tx_cpu[c].tc_lock);
/*
- * Measure how long the txg was open and replace the kstat.
- */
- th = dsl_pool_txg_history_get(dp, txg);
- th->th_kstat.open_time = gethrtime() - th->th_kstat.birth;
- th->th_kstat.state = TXG_STATE_QUIESCING;
- dsl_pool_txg_history_put(th);
- dsl_pool_txg_history_add(dp, tx->tx_open_txg);
-
- /*
* Quiesce the transaction group by waiting for everyone to txg_exit().
*/
- start = gethrtime();
-
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
mutex_enter(&tc->tc_lock);
@@ -395,12 +385,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
mutex_exit(&tc->tc_lock);
}
- /*
- * Measure how long the txg took to quiesce.
- */
- th = dsl_pool_txg_history_get(dp, txg);
- th->th_kstat.quiesce_time = gethrtime() - start;
- dsl_pool_txg_history_put(th);
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
}
static void
@@ -472,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp)
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
+ vdev_stat_t *vs1, *vs2;
uint64_t start, delta;
#ifdef _KERNEL
@@ -485,10 +471,11 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_enter(tx, &cpr);
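+	/*
+	 * Scratch vdev stats, used to compute per-txg I/O deltas for the
+	 * txg history kstat around each spa_sync().
+	 */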
+ vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
+ vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
+
start = delta = 0;
for (;;) {
- hrtime_t hrstart;
- txg_history_t *th;
uint64_t timer, timeout;
uint64_t txg;
@@ -522,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
- if (tx->tx_exiting)
+ if (tx->tx_exiting) {
+ kmem_free(vs2, sizeof(vdev_stat_t));
+ kmem_free(vs1, sizeof(vdev_stat_t));
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+ }
+
+ vdev_get_stats(spa->spa_root_vdev, vs1);
/*
* Consume the quiesced txg which has been handed off to
@@ -535,17 +527,11 @@ txg_sync_thread(dsl_pool_t *dp)
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
- th = dsl_pool_txg_history_get(dp, txg);
- th->th_kstat.state = TXG_STATE_SYNCING;
- vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
- dsl_pool_txg_history_put(th);
-
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
start = ddi_get_lbolt();
- hrstart = gethrtime();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
@@ -559,22 +545,15 @@ txg_sync_thread(dsl_pool_t *dp)
*/
txg_dispatch_callbacks(dp, txg);
- /*
- * Measure the txg sync time determine the amount of I/O done.
- */
- th = dsl_pool_txg_history_get(dp, txg);
- vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
- th->th_kstat.sync_time = gethrtime() - hrstart;
- th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
- th->th_vs1.vs_bytes[ZIO_TYPE_READ];
- th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
- th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
- th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
- th->th_vs1.vs_ops[ZIO_TYPE_READ];
- th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
- th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
- th->th_kstat.state = TXG_STATE_COMMITTED;
- dsl_pool_txg_history_put(th);
+ vdev_get_stats(spa->spa_root_vdev, vs2);
+ spa_txg_history_set_io(spa, txg,
+ vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
+ vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
+ vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
+ vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
+ dp->dp_space_towrite[txg & TXG_MASK] +
+ dp->dp_tempreserved[txg & TXG_MASK] / 2);
+ spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}