-rw-r--r--  cmd/zpool/Makefile.am                                                            |    2
-rw-r--r--  cmd/zpool/zpool_iter.c                                                           |   66
-rw-r--r--  cmd/zpool/zpool_main.c                                                           | 1330
-rw-r--r--  cmd/zpool/zpool_util.c                                                           |   25
-rw-r--r--  cmd/zpool/zpool_util.h                                                           |    6
-rw-r--r--  include/libzfs.h                                                                 |   11
-rw-r--r--  include/sys/fs/zfs.h                                                             |   73
-rw-r--r--  include/sys/vdev.h                                                               |    3
-rw-r--r--  include/sys/vdev_impl.h                                                          |    1
-rw-r--r--  include/sys/zfs_context.h                                                        |    1
-rw-r--r--  include/sys/zio.h                                                                |    3
-rw-r--r--  include/sys/zio_priority.h                                                       |    3
-rw-r--r--  lib/libspl/include/sys/sysmacros.h                                               |    3
-rw-r--r--  lib/libzfs/libzfs_pool.c                                                         |    2
-rw-r--r--  lib/libzfs/libzfs_util.c                                                         |   94
-rw-r--r--  lib/libzpool/kernel.c                                                            |   44
-rw-r--r--  lib/libzpool/util.c                                                              |    7
-rw-r--r--  man/man8/zpool.8                                                                 |  209
-rw-r--r--  module/zfs/spa.c                                                                 |    2
-rw-r--r--  module/zfs/vdev.c                                                                |  149
-rw-r--r--  module/zfs/vdev_disk.c                                                           |    9
-rw-r--r--  module/zfs/vdev_label.c                                                          |  107
-rw-r--r--  module/zfs/zio.c                                                                 |    9
-rw-r--r--  tests/runfiles/linux.run                                                         |    2
-rw-r--r--  tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am               |    3
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh                 |    2
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh  |    7
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh  |    5
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh  |   74
29 files changed, 2074 insertions(+), 178 deletions(-)
diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
index c11951b22..b4ff106e1 100644
--- a/cmd/zpool/Makefile.am
+++ b/cmd/zpool/Makefile.am
@@ -19,4 +19,4 @@ zpool_LDADD = \
$(top_builddir)/lib/libzpool/libzpool.la \
$(top_builddir)/lib/libzfs/libzfs.la \
$(top_builddir)/lib/libzfs_core/libzfs_core.la \
- $(LIBBLKID)
+ -lm $(LIBBLKID)
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
index 952d19172..a18ccf29d 100644
--- a/cmd/zpool/zpool_iter.c
+++ b/cmd/zpool/zpool_iter.c
@@ -250,3 +250,69 @@ for_each_pool(int argc, char **argv, boolean_t unavail,
return (ret);
}
+
+static int
+for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func,
+ void *data)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ int ret = 0;
+ int i;
+ char *type;
+
+ const char *list[] = {
+ ZPOOL_CONFIG_SPARES,
+ ZPOOL_CONFIG_L2CACHE,
+ ZPOOL_CONFIG_CHILDREN
+ };
+
+ for (i = 0; i < ARRAY_SIZE(list); i++) {
+ if (nvlist_lookup_nvlist_array(nv, list[i], &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++) {
+ uint64_t ishole = 0;
+
+ (void) nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole);
+
+ if (ishole)
+ continue;
+
+ ret |= for_each_vdev_cb(zhp, child[c], func,
+ data);
+ }
+ }
+ }
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (ret);
+
+ /* Don't run our function on root vdevs */
+ if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
+ ret |= func(zhp, nv, data);
+ }
+
+ return (ret);
+}
+
+/*
+ * This is the equivalent of for_each_pool() for vdevs. It iterates through
+ * all vdevs in the pool, ignoring root vdevs and holes, calling func() on
+ * each one.
+ *
+ * @zhp: Zpool handle
+ * @func: Function to call on each vdev
+ * @data: Custom data to pass to the function
+ */
+int
+for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data)
+{
+	nvlist_t *config, *nvroot = NULL;
+
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ }
+ return (for_each_vdev_cb(zhp, nvroot, func, data));
+}
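
A usage sketch of the new iterator (editorial, not part of this patch; the
callback name is hypothetical). Counting every non-root, non-hole vdev in a
pool looks like this:

	/* Hypothetical callback: tally each vdev visited */
	static int
	vdev_count_cb(zpool_handle_t *zhp, nvlist_t *nv, void *data)
	{
		unsigned int *count = data;

		(*count)++;
		return (0);
	}

	unsigned int count = 0;
	(void) for_each_vdev(zhp, vdev_count_cb, &count);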
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9c7e2a0c4..6412a8e93 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -51,6 +51,7 @@
#include <sys/fm/util.h>
#include <sys/fm/protocol.h>
#include <sys/zfs_ioctl.h>
+#include <math.h>
#include <libzfs.h>
@@ -144,6 +145,23 @@ typedef enum {
} zpool_help_t;
+/*
+ * Flags for stats to display with "zpool iostat"
+ */
+enum iostat_type {
+ IOS_DEFAULT = 0,
+ IOS_LATENCY = 1,
+ IOS_QUEUES = 2,
+ IOS_L_HISTO = 3,
+ IOS_COUNT, /* always last element */
+};
+
+/* iostat_type entries as bitmasks */
+#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT)
+#define IOS_LATENCY_M (1ULL << IOS_LATENCY)
+#define IOS_QUEUES_M (1ULL << IOS_QUEUES)
+#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO)
+
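
These masks are consumed bit-by-bit further down (see print_iostat_labels()
and print_iostat_dashes()). A minimal sketch of that iteration pattern, with
an assumed flags value for illustration:

	uint64_t f, flags = IOS_DEFAULT_M | IOS_LATENCY_M;
	int idx;

	/* Visit each set bit, lowest first; lowbit64() is 1-based */
	for (f = flags; f; f &= ~(1ULL << idx)) {
		idx = lowbit64(f) - 1;	/* idx is an iostat_type value */
		/* ... handle the stat type in idx ... */
	}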
typedef struct zpool_command {
const char *name;
int (*func)(int, char **);
@@ -196,7 +214,7 @@ static zpool_command_t command_table[] = {
{ "set", zpool_do_set, HELP_SET },
};
-#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
+#define NCOMMAND (ARRAY_SIZE(command_table))
static zpool_command_t *current_command;
static char history_str[HIS_MAX_RECORD_LEN];
@@ -237,7 +255,8 @@ get_usage(zpool_help_t idx) {
"[-R root] [-F [-n]]\n"
"\t <pool | id> [newpool]\n"));
case HELP_IOSTAT:
- return (gettext("\tiostat [-gLPvy] [-T d|u] [pool] ... "
+ return (gettext("\tiostat [-T d | u] [-ghHLpPvy] [[-lq]|-w]\n"
+ "\t [[pool ...]|[pool vdev ...]|[vdev ...]] "
"[interval [count]]\n"));
case HELP_LABELCLEAR:
return (gettext("\tlabelclear [-f] <vdev>\n"));
@@ -2481,61 +2500,690 @@ error:
}
typedef struct iostat_cbdata {
- boolean_t cb_verbose;
+ uint64_t cb_flags;
int cb_name_flags;
int cb_namewidth;
int cb_iteration;
+ char **cb_vdev_names; /* Only show these vdevs */
+ unsigned int cb_vdev_names_count;
+ boolean_t cb_verbose;
+ boolean_t cb_literal;
+ boolean_t cb_scripted;
zpool_list_t *cb_list;
} iostat_cbdata_t;
+/* iostat labels */
+typedef struct name_and_columns {
+ const char *name; /* Column name */
+ unsigned int columns; /* Center name to this number of columns */
+} name_and_columns_t;
+
+#define IOSTAT_MAX_LABELS 11 /* Max number of labels on one line */
+
+static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
+ {NULL}},
+ [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+ {"asyncq_wait", 2}, {"scrub"}},
+ [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
+ {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
+ {NULL}},
+ [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2},
+ {"sync_queue", 2}, {"async_queue", 2}, {NULL}},
+};
+
+/* Shorthand - if "columns" field not set, default to 1 column */
+static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {NULL}},
+ [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}},
+ [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
+ {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}},
+ [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}},
+};
+
+/*
+ * Return the number of labels in a null-terminated name_and_columns_t
+ * array.
+ *
+ */
+static unsigned int
+label_array_len(const name_and_columns_t *labels)
+{
+ int i = 0;
+
+ while (labels[i].name)
+ i++;
+
+ return (i);
+}
+
+/*
+ * Return a default column width for default/latency/queue columns. This does
+ * not include histograms, which have their columns autosized.
+ */
+static unsigned int
+default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
+{
+ unsigned long column_width = 5; /* Normal niceprint */
+ static unsigned long widths[] = {
+ /*
+ * Choose some sane default column sizes for printing the
+ * raw numbers.
+ */
+ [IOS_DEFAULT] = 15, /* 1PB capacity */
+ [IOS_LATENCY] = 10, /* 1B ns = 10sec */
+ [IOS_QUEUES] = 6, /* 1M queue entries */
+ };
+
+ if (cb->cb_literal)
+ column_width = widths[type];
+
+ return (column_width);
+}
+
+/*
+ * Print the column labels, i.e:
+ *
+ * capacity operations bandwidth
+ * alloc free read write read write ...
+ *
+ * If force_column_width is set, use it for the column width. If not set, use
+ * the default column width.
+ */
+void
+print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
+{
+ int i, idx, s;
+ unsigned int text_start, rw_column_width, spaces_to_end;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ unsigned int column_width = force_column_width;
+
+ /* For each bit set in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ if (!force_column_width)
+ column_width = default_column_width(cb, idx);
+ /* Print our top labels centered over "read write" label. */
+ for (i = 0; i < label_array_len(labels[idx]); i++) {
+ const char *name = labels[idx][i].name;
+ /*
+ * We treat labels[][].columns == 0 as shorthand
+ * for one column. It makes writing out the label
+ * tables more concise.
+ */
+ unsigned int columns = MAX(1, labels[idx][i].columns);
+ unsigned int slen = strlen(name);
+
+ rw_column_width = (column_width * columns) +
+ (2 * (columns - 1));
+
+ text_start = (int) ((rw_column_width)/columns -
+ slen/columns);
+
+ printf(" "); /* Two spaces between columns */
+
+ /* Space from beginning of column to label */
+ for (s = 0; s < text_start; s++)
+ printf(" ");
+
+ printf("%s", name);
+
+ /* Print space after label to end of column */
+ spaces_to_end = rw_column_width - text_start - slen;
+ for (s = 0; s < spaces_to_end; s++)
+ printf(" ");
+
+ }
+ }
+ printf("\n");
+}
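
To make the centering arithmetic concrete, a worked example with assumed
values: the two-column label "operations" (slen = 10) over 5-wide read/write
columns works out as follows:

	/*
	 * columns = 2, column_width = 5, slen = 10
	 * rw_column_width = (5 * 2) + (2 * 1) = 12
	 * text_start      = 12/2 - 10/2      = 1
	 * spaces_to_end   = 12 - 1 - 10      = 1
	 *
	 * => "  " + " " + "operations" + " ", centered over "read write"
	 */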
+
+/*
+ * Utility function to print out a line of dashes like:
+ *
+ * -------------------------------- ----- ----- ----- ----- -----
+ *
+ * ...or a dashed named-row line like:
+ *
+ * logs - - - - -
+ *
+ * @cb: iostat data
+ *
+ * @force_column_width If non-zero, use the value as the column width.
+ * Otherwise use the default column widths.
+ *
+ * @name: Print a dashed named-row line starting
+ * with @name. Otherwise, print a regular
+ * dashed line.
+ */
+static void
+print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *name)
+{
+ int i;
+ unsigned int namewidth;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ int idx;
+ const name_and_columns_t *labels;
+
+ if (cb->cb_flags & IOS_L_HISTO_M)
+ namewidth = MAX(cb->cb_namewidth, strlen("latency"));
+ else
+ namewidth = cb->cb_namewidth;
+
+ if (name) {
+ namewidth = MAX(cb->cb_namewidth, strlen(name));
+ printf("%-*s", namewidth, name);
+ } else {
+ for (i = 0; i < namewidth; i++)
+ (void) printf("-");
+ }
+
+ /* For each bit in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ unsigned int column_width;
+ idx = lowbit64(f) - 1;
+ if (force_column_width)
+ column_width = force_column_width;
+ else
+ column_width = default_column_width(cb, idx);
+
+ labels = iostat_bottom_labels[idx];
+ for (i = 0; i < label_array_len(labels); i++) {
+ if (name)
+ printf(" %*s-", column_width - 1, " ");
+ else
+ printf(" %.*s", column_width,
+ "--------------------");
+ }
+ }
+ printf("\n");
+}
+
+
+static void
+print_iostat_separator_impl(iostat_cbdata_t *cb,
+ unsigned int force_column_width)
+{
+ print_iostat_dashes(cb, force_column_width, NULL);
+}
+
static void
print_iostat_separator(iostat_cbdata_t *cb)
{
- int i = 0;
+ print_iostat_separator_impl(cb, 0);
+}
+
+static void
+print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *histo_vdev_name)
+{
+ unsigned int namewidth;
+ uint64_t flags = cb->cb_flags;
+
+ if (flags & IOS_L_HISTO_M)
+ namewidth = MAX(cb->cb_namewidth, strlen("latency"));
+ else
+ namewidth = cb->cb_namewidth;
+
+ if (flags & IOS_L_HISTO_M)
+ printf("%-*s", namewidth, histo_vdev_name);
+ else
+ printf("%*s", namewidth, "");
- for (i = 0; i < cb->cb_namewidth; i++)
- (void) printf("-");
- (void) printf(" ----- ----- ----- ----- ----- -----\n");
+ print_iostat_labels(cb, force_column_width, iostat_top_labels);
+
+ printf("%-*s", namewidth, flags & IOS_L_HISTO_M ? "latency" :
+ cb->cb_vdev_names_count ? "vdev" : "pool");
+
+ print_iostat_labels(cb, force_column_width, iostat_bottom_labels);
+
+ print_iostat_separator_impl(cb, force_column_width);
}
static void
print_iostat_header(iostat_cbdata_t *cb)
{
- (void) printf("%*s capacity operations bandwidth\n",
- cb->cb_namewidth, "");
- (void) printf("%-*s alloc free read write read write\n",
- cb->cb_namewidth, "pool");
- print_iostat_separator(cb);
+ print_iostat_header_impl(cb, 0, NULL);
}
+
/*
* Display a single statistic.
*/
static void
-print_one_stat(uint64_t value)
+print_one_stat(uint64_t value, enum zfs_nicenum_format format,
+ unsigned int column_size, boolean_t scripted)
{
char buf[64];
- zfs_nicenum(value, buf, sizeof (buf));
- (void) printf(" %5s", buf);
+ zfs_nicenum_format(value, buf, sizeof (buf), format);
+
+ if (scripted)
+ printf("\t%s", buf);
+ else
+ printf(" %*s", column_size, buf);
+}
+
+/*
+ * Calculate the default vdev stats
+ *
+ * Subtract oldvs from newvs, apply a scaling factor, and save the resulting
+ * stats into calcvs.
+ */
+static void
+calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs,
+ vdev_stat_t *calcvs)
+{
+ int i;
+
+ memcpy(calcvs, newvs, sizeof (*calcvs));
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++)
+ calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]);
+
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++)
+ calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]);
+}
+
+/*
+ * Internal representation of the extended iostats data.
+ *
+ * The extended iostat stats are exported in nvlists as either uint64_t arrays
+ * or single uint64_t's. We make both look like arrays to make them easier
+ * to process. In order to make single uint64_t's look like arrays, we set
+ * __data to the stat data, and then set *data = &__data with count = 1. Then,
+ * we can just use *data and count.
+ */
+struct stat_array {
+ uint64_t *data;
+ uint_t count; /* Number of entries in data[] */
+ uint64_t __data; /* Only used when data is a single uint64_t */
+};
+
+static uint64_t
+stat_histo_max(struct stat_array *nva, unsigned int len) {
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array64_max(nva[i].data, nva[i].count));
+
+ return (max);
+}
+
+/*
+ * Helper function to lookup a uint64_t array or uint64_t value and store its
+ * data as a stat_array. If the nvpair is a single uint64_t value, then we make
+ * it look like a one element array to make it easier to process.
+ */
+static int
+nvpair64_to_stat_array(nvlist_t *nvl, const char *name,
+ struct stat_array *nva) {
+ nvpair_t *tmp;
+ int ret;
+
+ verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0);
+ switch (nvpair_type(tmp)) {
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvpair_value_uint64(tmp, &nva->__data);
+ nva->data = &nva->__data;
+ nva->count = 1;
+ break;
+ default:
+ /* Not a uint64_t */
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
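
A minimal sketch of the single-value case described above (the nvlist nvx and
its contents are assumed): after the call, data points at __data and count is
1, so callers can treat both shapes identically:

	struct stat_array nva;

	/* nva.data == &nva.__data and nva.count == 1 for a plain uint64 */
	if (nvpair64_to_stat_array(nvx,
	    ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, &nva) == 0)
		printf("%llu\n", (u_longlong_t)nva.data[0]);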
+
+/*
+ * Given a list of nvlist names, look up the extended stats in newnv and oldnv,
+ * subtract them, and return the results in a newly allocated stat_array.
+ * You must free the returned array after you are done with it with
+ * free_calc_stats().
+ *
+ * Additionally, you can set "oldnv" to NULL if you simply want the newnv
+ * values.
+ */
+static struct stat_array *
+calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ nvlist_t *oldnvx = NULL, *newnvx;
+ struct stat_array *oldnva, *newnva, *calcnva;
+ int i, j;
+ unsigned int alloc_size = (sizeof (struct stat_array)) * len;
+
+ /* Extract our extended stats nvlist from the main list */
+ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &newnvx) == 0);
+ if (oldnv) {
+ verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &oldnvx) == 0);
+ }
+
+ newnva = safe_malloc(alloc_size);
+ oldnva = safe_malloc(alloc_size);
+ calcnva = safe_malloc(alloc_size);
+
+ for (j = 0; j < len; j++) {
+ verify(nvpair64_to_stat_array(newnvx, names[j],
+ &newnva[j]) == 0);
+ calcnva[j].count = newnva[j].count;
+ alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]);
+ calcnva[j].data = safe_malloc(alloc_size);
+ memcpy(calcnva[j].data, newnva[j].data, alloc_size);
+
+ if (oldnvx) {
+ verify(nvpair64_to_stat_array(oldnvx, names[j],
+ &oldnva[j]) == 0);
+ for (i = 0; i < oldnva[j].count; i++)
+ calcnva[j].data[i] -= oldnva[j].data[i];
+ }
+ }
+ free(newnva);
+ free(oldnva);
+ return (calcnva);
+}
+
+static void
+free_calc_stats(struct stat_array *nva, unsigned int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ free(nva[i].data);
+
+ free(nva);
+}
+
+static void
+print_iostat_histo(struct stat_array *nva, unsigned int len,
+ iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth,
+ double scale)
+{
+ int i, j;
+ char buf[6];
+ uint64_t val;
+ enum zfs_nicenum_format format;
+ unsigned int buckets;
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ /* All these histos are the same size, so just use nva[0].count */
+ buckets = nva[0].count;
+
+ for (j = 0; j < buckets; j++) {
+ /* Ending range of this bucket */
+ val = (1UL << (j + 1)) - 1;
+
+ /* Print histogram bucket label */
+ zfs_nicetime(val, buf, sizeof (buf));
+ if (cb->cb_scripted)
+ printf("%llu", (u_longlong_t) val);
+ else
+ printf("%-*s", namewidth, buf);
+
+ /* Print the values on the line */
+ for (i = 0; i < len; i++) {
+ print_one_stat(nva[i].data[j] * scale, format,
+ column_width, cb->cb_scripted);
+ }
+ printf("\n");
+ }
+}
+
+static void
+print_solid_separator(unsigned int length)
+{
+ while (length--)
+ printf("-");
+ printf("\n");
+}
+
+static void
+print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv, double scale, const char *name)
+{
+ unsigned int column_width;
+ unsigned int namewidth;
+ unsigned int entire_width;
+
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ };
+ struct stat_array *nva;
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+ if (cb->cb_literal) {
+ column_width = MAX(5,
+ (unsigned int) log10(stat_histo_max(nva,
+ ARRAY_SIZE(names))) + 1);
+ } else {
+ column_width = 5;
+ }
+
+ namewidth = MAX(cb->cb_namewidth, strlen("latency"));
+
+ /*
+ * Calculate the entire line width of what we're printing. The
+ * +2 is for the two spaces between columns:
+ */
+ /* read write */
+ /* ----- ----- */
+ /* |___| <---------- column_width */
+ /* */
+ /* |__________| <--- entire_width */
+ /* */
+ entire_width = namewidth + (column_width + 2) *
+ label_array_len(iostat_bottom_labels[IOS_L_HISTO]);
+
+ if (cb->cb_scripted)
+ printf("%s\n", name);
+ else
+ print_iostat_header_impl(cb, column_width, name);
+
+ print_iostat_histo(nva, ARRAY_SIZE(names), cb, column_width,
+ namewidth, scale);
+
+ free_calc_stats(nva, ARRAY_SIZE(names));
+ if (!cb->cb_scripted)
+ print_solid_separator(entire_width);
+}
+
+/*
+ * Calculate the average latency of a power-of-two latency histogram
+ */
+static uint64_t
+single_histo_average(uint64_t *histo, unsigned int buckets)
+{
+ int i;
+ uint64_t count = 0, total = 0;
+
+ for (i = 0; i < buckets; i++) {
+ /*
+ * Our buckets are power-of-two latency ranges. Use the
+ * midpoint latency of each bucket to calculate the average.
+ * For example:
+ *
+ * Bucket Midpoint
+ * 8ns-15ns: 12ns
+ * 16ns-31ns: 24ns
+ * ...
+ */
+ if (histo[i] != 0) {
+ total += histo[i] * (((1UL << i) + ((1UL << i)/2)));
+ count += histo[i];
+ }
+ }
+
+ /* Prevent divide by zero */
+ return (count == 0 ? 0 : total / count);
+}
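
A worked example of the midpoint math, with assumed bucket counts:

	/*
	 * histo[3] = 2  (8ns-15ns,  midpoint (1<<3) + (1<<3)/2 = 12ns)
	 * histo[4] = 2  (16ns-31ns, midpoint (1<<4) + (1<<4)/2 = 24ns)
	 *
	 * average = (2*12 + 2*24) / (2 + 2) = 18ns
	 */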
+
+static void
+print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv, double scale)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ };
+
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_QUEUES);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ val = nva[i].data[0] * scale;
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+static void
+print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv, double scale)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ };
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_LATENCY);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_TIME;
+
+ /* Print our avg latencies on the line */
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ /* Compute average latency for a latency histo */
+ val = single_histo_average(nva[i].data, nva[i].count) * scale;
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+/*
+ * Print default statistics (capacity/operations/bandwidth)
+ */
+static void
+print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale)
+{
+ unsigned int column_width = default_column_width(cb, IOS_DEFAULT);
+ enum zfs_nicenum_format format;
+ char na; /* char to print for "not applicable" values */
+
+ if (cb->cb_literal) {
+ format = ZFS_NICENUM_RAW;
+ na = '0';
+ } else {
+ format = ZFS_NICENUM_1024;
+ na = '-';
+ }
+
+ /* only toplevel vdevs have capacity stats */
+ if (vs->vs_space == 0) {
+ if (cb->cb_scripted)
+ printf("\t%c\t%c", na, na);
+ else
+ printf(" %*c %*c", column_width, na, column_width,
+ na);
+ } else {
+ print_one_stat(vs->vs_alloc, format, column_width,
+ cb->cb_scripted);
+ print_one_stat(vs->vs_space - vs->vs_alloc, format,
+ column_width, cb->cb_scripted);
+ }
+
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
}
/*
* Print out all the statistics for the given vdev. This can either be the
* toplevel configuration, or called recursively. If 'name' is NULL, then this
* is a verbose output, and we don't want to display the toplevel pool stats.
+ *
+ * Returns the number of stat lines printed.
*/
-void
+unsigned int
print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
{
nvlist_t **oldchild, **newchild;
uint_t c, children;
- vdev_stat_t *oldvs, *newvs;
+ vdev_stat_t *oldvs, *newvs, *calcvs;
vdev_stat_t zerovs = { 0 };
+ char *vname;
+ int i;
+ int ret = 0;
uint64_t tdelta;
double scale;
- char *vname;
+
+ calcvs = safe_malloc(sizeof (*calcvs));
if (oldnv != NULL) {
verify(nvlist_lookup_uint64_array(oldnv,
@@ -2544,54 +3192,92 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
oldvs = &zerovs;
}
+ /* Do we only want to see a specific vdev? */
+ for (i = 0; i < cb->cb_vdev_names_count; i++) {
+ /* Yes we do. Is this the vdev? */
+ if (strcmp(name, cb->cb_vdev_names[i]) == 0) {
+ /*
+ * This is our vdev. Since it is the only vdev we
+ * will be displaying, make depth = 0 so that it
+ * doesn't get indented.
+ */
+ depth = 0;
+ break;
+ }
+ }
+
+ if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) {
+ /* Couldn't match the name */
+ goto children;
+ }
+
+
verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&newvs, &c) == 0);
- if (strlen(name) + depth > cb->cb_namewidth)
- (void) printf("%*s%s", depth, "", name);
- else
- (void) printf("%*s%s%*s", depth, "", name,
- (int)(cb->cb_namewidth - strlen(name) - depth), "");
+ /*
+	 * Print the vdev name unless it's a histogram. Histograms
+ * display the vdev name in the header itself.
+ */
+ if (!(cb->cb_flags & IOS_L_HISTO_M)) {
+ if (cb->cb_scripted) {
+ printf("%s", name);
+ } else {
+ if (strlen(name) + depth > cb->cb_namewidth)
+ (void) printf("%*s%s", depth, "", name);
+ else
+ (void) printf("%*s%s%*s", depth, "", name,
+ (int)(cb->cb_namewidth - strlen(name) -
+ depth), "");
+ }
+ }
+ /* Calculate our scaling factor */
tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
-
- if (tdelta == 0)
- scale = 1.0;
- else
- scale = (double)NANOSEC / tdelta;
-
- /* only toplevel vdevs have capacity stats */
- if (newvs->vs_space == 0) {
- (void) printf(" - -");
+ if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_L_HISTO_M)) {
+ /*
+ * If we specify printing histograms with no time interval, then
+ * print the histogram numbers over the entire lifetime of the
+ * vdev.
+ */
+ scale = 1;
} else {
- print_one_stat(newvs->vs_alloc);
- print_one_stat(newvs->vs_space - newvs->vs_alloc);
+ if (tdelta == 0)
+ scale = 1.0;
+ else
+ scale = (double)NANOSEC / tdelta;
}
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
- oldvs->vs_ops[ZIO_TYPE_READ])));
-
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
- oldvs->vs_ops[ZIO_TYPE_WRITE])));
-
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
- oldvs->vs_bytes[ZIO_TYPE_READ])));
+ if (cb->cb_flags & IOS_DEFAULT_M) {
+ calc_default_iostats(oldvs, newvs, calcvs);
+ print_iostat_default(calcvs, cb, scale);
+ }
+ if (cb->cb_flags & IOS_LATENCY_M)
+ print_iostat_latency(cb, oldnv, newnv, scale);
+ if (cb->cb_flags & IOS_QUEUES_M)
+ print_iostat_queues(cb, oldnv, newnv, scale);
+ if (cb->cb_flags & IOS_L_HISTO_M) {
+ printf("\n");
+ print_iostat_histos(cb, oldnv, newnv, scale, name);
+ }
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
- oldvs->vs_bytes[ZIO_TYPE_WRITE])));
+ if (!(cb->cb_flags & IOS_L_HISTO_M))
+ printf("\n");
- (void) printf("\n");
+ free(calcvs);
+ ret++;
+children:
if (!cb->cb_verbose)
- return;
+ return (ret);
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
&newchild, &children) != 0)
- return;
+ return (ret);
if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
&oldchild, &c) != 0)
- return;
+ return (ret);
for (c = 0; c < children; c++) {
uint64_t ishole = B_FALSE, islog = B_FALSE;
@@ -2607,7 +3293,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
}
@@ -2617,8 +3303,10 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
*/
if (num_logs(newnv) > 0) {
- (void) printf("%-*s - - - - - "
- "-\n", cb->cb_namewidth, "logs");
+ if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted &&
+ !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0, "logs");
+ }
for (c = 0; c < children; c++) {
uint64_t islog = B_FALSE;
@@ -2628,7 +3316,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
if (islog) {
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ?
+ ret += print_vdev_stats(zhp, vname, oldnv ?
oldchild[c] : NULL, newchild[c],
cb, depth + 2);
free(vname);
@@ -2642,23 +3330,28 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
*/
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
&newchild, &children) != 0)
- return;
+ return (ret);
if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
&oldchild, &c) != 0)
- return;
+ return (ret);
if (children > 0) {
- (void) printf("%-*s - - - - - "
- "-\n", cb->cb_namewidth, "cache");
+ if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted &&
+ !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0, "cache");
+ }
+
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
- newchild[c], cb, depth + 2);
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c]
+ : NULL, newchild[c], cb, depth + 2);
free(vname);
}
}
+
+ return (ret);
}
static int
@@ -2688,6 +3381,7 @@ print_iostat(zpool_handle_t *zhp, void *data)
iostat_cbdata_t *cb = data;
nvlist_t *oldconfig, *newconfig;
nvlist_t *oldnvroot, *newnvroot;
+ int ret;
newconfig = zpool_get_config(zhp, &oldconfig);
@@ -2703,15 +3397,13 @@ print_iostat(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
&oldnvroot) == 0);
- /*
- * Print out the statistics for the pool.
- */
- print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0);
+ ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot,
+ cb, 0);
+ if ((ret != 0) && !(cb->cb_flags & IOS_L_HISTO_M) && !cb->cb_scripted &&
+ cb->cb_verbose && !cb->cb_vdev_names_count)
+ print_iostat_separator(cb);
- if (cb->cb_verbose)
- print_iostat_separator(cb);
-
- return (0);
+ return (ret);
}
static int
@@ -2742,13 +3434,14 @@ get_namewidth(zpool_handle_t *zhp, void *data)
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
+ unsigned int poolname_len = strlen(zpool_get_name(zhp));
if (!cb->cb_verbose)
- cb->cb_namewidth = strlen(zpool_get_name(zhp));
+ cb->cb_namewidth = poolname_len;
else
- cb->cb_namewidth = max_width(zhp, nvroot, 0,
- cb->cb_namewidth, cb->cb_name_flags);
+ cb->cb_namewidth = MAX(poolname_len,
+ max_width(zhp, nvroot, 0, cb->cb_namewidth,
+ cb->cb_name_flags));
}
-
/*
* The width must be at least 10, but may be as large as the
* column width - 42 so that we can still fit in one line.
@@ -2767,20 +3460,21 @@ get_namewidth(zpool_handle_t *zhp, void *data)
* Parse the input string, get the 'interval' and 'count' value if there is one.
*/
static void
-get_interval_count(int *argcp, char **argv, unsigned long *iv,
+get_interval_count(int *argcp, char **argv, float *iv,
unsigned long *cnt)
{
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
int argc = *argcp;
/*
* Determine if the last argument is an integer or a pool name
*/
- if (argc > 0 && isdigit(argv[argc - 1][0])) {
+ if (argc > 0 && isnumber(argv[argc - 1])) {
char *end;
errno = 0;
- interval = strtoul(argv[argc - 1], &end, 10);
+ interval = strtof(argv[argc - 1], &end);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
@@ -2806,12 +3500,12 @@ get_interval_count(int *argcp, char **argv, unsigned long *iv,
* If the last argument is also an integer, then we have both a count
* and an interval.
*/
- if (argc > 0 && isdigit(argv[argc - 1][0])) {
+ if (argc > 0 && isnumber(argv[argc - 1])) {
char *end;
errno = 0;
count = interval;
- interval = strtoul(argv[argc - 1], &end, 10);
+ interval = strtof(argv[argc - 1], &end);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
@@ -2846,12 +3540,299 @@ get_timestamp_arg(char c)
}
/*
- * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]]
+ * Return stat flags that are supported on all pools by both the module and
+ * zpool iostat. "*data" should be initialized to all 0xFFs before running.
+ * It will get ANDed down until only the flags that are supported on all pools
+ * remain.
+ */
+static int
+get_stat_flags_cb(zpool_handle_t *zhp, void *data)
+{
+ uint64_t *mask = data;
+ nvlist_t *config, *nvroot, *nvx;
+ uint64_t flags = 0;
+ int i, j;
+
+ /*
+ * Lookup table for extended iostat flags to nvlist names.
+ * Basically a list of all the nvpairs a flag requires.
+ */
+ static const char *vsx_type_to_nvlist[IOS_COUNT][10] = {
+ [IOS_L_HISTO] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ NULL},
+ [IOS_LATENCY] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ NULL},
+ [IOS_QUEUES] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ NULL}
+ };
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ /* Default stats are always supported, but for completeness.. */
+ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS))
+ flags |= IOS_DEFAULT_M;
+
+ /* Get our extended stats nvlist from the main list */
+ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &nvx) != 0) {
+ /*
+ * No extended stats; they're probably running an older
+ * module. No big deal, we support that too.
+ */
+ goto end;
+ }
+
+ /* For each extended stat, make sure all its nvpairs are supported */
+ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) {
+ if (!vsx_type_to_nvlist[j][0])
+ continue;
+
+ /* Start off by assuming the flag is supported, then check */
+ flags |= (1ULL << j);
+ for (i = 0; vsx_type_to_nvlist[j][i]; i++) {
+ if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) {
+ /* flag isn't supported */
+ flags = flags & ~(1ULL << j);
+ break;
+ }
+ }
+ }
+end:
+ *mask = *mask & flags;
+ return (0);
+}
+
+/*
+ * Return a bitmask of stats that are supported on all pools by both the module
+ * and zpool iostat.
+ */
+static uint64_t
+get_stat_flags(zpool_list_t *list)
+{
+ uint64_t mask = -1;
+
+ /*
+ * get_stat_flags_cb() will lop off bits from "mask" until only the
+ * flags that are supported on all pools remain.
+ */
+ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask);
+ return (mask);
+}
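
A sketch of how the mask narrows across pools, with assumed per-pool support:

	/*
	 * Example: pool A supports default + latency + queue stats;
	 * pool B runs an older module with only default stats.
	 *
	 *   mask = ~0ULL
	 *   A: mask &= (IOS_DEFAULT_M | IOS_LATENCY_M | IOS_QUEUES_M)
	 *   B: mask &= IOS_DEFAULT_M
	 *
	 * Only IOS_DEFAULT_M survives, so only the default stats can be
	 * displayed for this pool list.
	 */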
+
+/*
+ * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise.
+ */
+static int
+is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data)
+{
+ iostat_cbdata_t *cb = cb_data;
+ char *name;
+
+ name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags);
+
+ if (strcmp(name, cb->cb_vdev_names[0]) == 0)
+ return (1); /* match */
+
+ return (0);
+}
+
+/*
+ * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise.
+ */
+static int
+is_vdev(zpool_handle_t *zhp, void *cb_data)
+{
+ return (for_each_vdev(zhp, is_vdev_cb, cb_data));
+}
+
+/*
+ * Check if vdevs are in a pool
+ *
+ * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise
+ * return 0. If pool_name is NULL, then search all pools.
+ */
+static int
+are_vdevs_in_pool(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ char **tmp_name;
+ int ret = 0;
+ int i;
+ int pool_count = 0;
+
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ if (pool_name)
+ pool_count = 1;
+
+ /* Temporarily hijack cb_vdev_names for a second... */
+ tmp_name = cb->cb_vdev_names;
+
+ /* Go though our list of prospective vdev names */
+ for (i = 0; i < argc; i++) {
+ cb->cb_vdev_names = argv + i;
+
+ /* Is this name a vdev in our pools? */
+ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL,
+ is_vdev, cb);
+ if (!ret) {
+ /* No match */
+ break;
+ }
+ }
+
+ cb->cb_vdev_names = tmp_name;
+
+ return (ret);
+}
+
+static int
+is_pool_cb(zpool_handle_t *zhp, void *data)
+{
+ char *name = data;
+ if (strcmp(name, zpool_get_name(zhp)) == 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Do we have a pool named *name? If so, return 1, otherwise 0.
+ */
+static int
+is_pool(char *name)
+{
+ return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name));
+}
+
+/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */
+static int
+are_all_pools(int argc, char **argv) {
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ while (--argc >= 0)
+ if (!is_pool(argv[argc]))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Helper function to print out vdev/pool names we can't resolve. Used for an
+ * error message.
+ */
+static void
+error_list_unresolved_vdevs(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ int i;
+ char *name;
+ char *str;
+ for (i = 0; i < argc; i++) {
+ name = argv[i];
+
+ if (is_pool(name))
+ str = gettext("pool");
+ else if (are_vdevs_in_pool(1, &name, pool_name, cb))
+ str = gettext("vdev in this pool");
+ else if (are_vdevs_in_pool(1, &name, NULL, cb))
+ str = gettext("vdev in another pool");
+ else
+ str = gettext("unknown");
+
+ fprintf(stderr, "\t%s (%s)\n", name, str);
+ }
+}
+
+/*
+ * Same as get_interval_count(), but with additional checks to not misinterpret
+ * guids as interval/count values. Assumes VDEV_NAME_GUID is set in
+ * cb.cb_name_flags.
+ */
+static void
+get_interval_count_filter_guids(int *argc, char **argv, float *interval,
+ unsigned long *count, iostat_cbdata_t *cb)
+{
+ char **tmpargv = argv;
+ int argc_for_interval = 0;
+
+ /* Is the last arg an interval value? Or a guid? */
+ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) {
+ /*
+ * The last arg is not a guid, so it's probably an
+ * interval value.
+ */
+ argc_for_interval++;
+
+ if (*argc >= 2 &&
+ !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) {
+ /*
+ * The 2nd to last arg is not a guid, so it's probably
+ * an interval value.
+ */
+ argc_for_interval++;
+ }
+ }
+
+ /* Point to our list of possible intervals */
+ tmpargv = &argv[*argc - argc_for_interval];
+
+ *argc = *argc - argc_for_interval;
+ get_interval_count(&argc_for_interval, tmpargv,
+ interval, count);
+}
+
+/*
+ * Floating point sleep(). Allows you to pass in a floating point value for
+ * seconds.
+ */
+static void
+fsleep(float sec) {
+ struct timespec req;
+ req.tv_sec = floor(sec);
+ req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC;
+ nanosleep(&req, NULL);
+}
+
+
+/*
+ * zpool iostat [-ghHLpPvy] [[-lq]|-w] [-T d|u]
+ * [[ pool ...]|[pool vdev ...]|[vdev ...]]
+ * [interval [count]]
*
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
* -P Display full path for vdev name.
* -v Display statistics for individual vdevs
+ * -h Display help
+ * -p Display values in parsable (exact) format.
+ * -H Scripted mode. Don't display headers, and separate properties
+ * by a single tab.
+ * -l Display average latency
+ * -q Display queue depths
+ * -w Display histograms
* -T Display a timestamp in date(1) or Unix format
*
* This command can be tricky because we want to be able to deal with pool
@@ -2866,17 +3847,26 @@ zpool_do_iostat(int argc, char **argv)
int c;
int ret;
int npools;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
zpool_list_t *list;
boolean_t verbose = B_FALSE;
+ boolean_t latency = B_FALSE, histo = B_FALSE;
+ boolean_t queues = B_FALSE, parseable = B_FALSE, scripted = B_FALSE;
boolean_t omit_since_boot = B_FALSE;
boolean_t guid = B_FALSE;
boolean_t follow_links = B_FALSE;
boolean_t full_name = B_FALSE;
iostat_cbdata_t cb = { 0 };
+ /* Used for printing error message */
+ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q',
+ [IOS_L_HISTO] = 'w'};
+
+ uint64_t unsupported_flags;
+
/* check options */
- while ((c = getopt(argc, argv, "gLPT:vy")) != -1) {
+ while ((c = getopt(argc, argv, "gLPT:vyhplqwH")) != -1) {
switch (c) {
case 'g':
guid = B_TRUE;
@@ -2893,9 +3883,27 @@ zpool_do_iostat(int argc, char **argv)
case 'v':
verbose = B_TRUE;
break;
+ case 'p':
+ parseable = B_TRUE;
+ break;
+ case 'l':
+ latency = B_TRUE;
+ break;
+ case 'q':
+ queues = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 'w':
+ histo = B_TRUE;
+ break;
case 'y':
omit_since_boot = B_TRUE;
break;
+ case 'h':
+ usage(B_FALSE);
+ break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@@ -2906,7 +3914,70 @@ zpool_do_iostat(int argc, char **argv)
argc -= optind;
argv += optind;
- get_interval_count(&argc, argv, &interval, &count);
+ cb.cb_literal = parseable;
+ cb.cb_scripted = scripted;
+
+ if (guid)
+ cb.cb_name_flags |= VDEV_NAME_GUID;
+ if (follow_links)
+ cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ if (full_name)
+ cb.cb_name_flags |= VDEV_NAME_PATH;
+ cb.cb_iteration = 0;
+ cb.cb_namewidth = 0;
+ cb.cb_verbose = verbose;
+
+ /* Get our interval and count values (if any) */
+ if (guid) {
+ get_interval_count_filter_guids(&argc, argv, &interval,
+ &count, &cb);
+ } else {
+ get_interval_count(&argc, argv, &interval, &count);
+ }
+
+ if (argc == 0) {
+ /* No args, so just print the defaults. */
+ } else if (are_all_pools(argc, argv)) {
+ /* All the args are pool names */
+ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) {
+ /* All the args are vdevs */
+ cb.cb_vdev_names = argv;
+ cb.cb_vdev_names_count = argc;
+ argc = 0; /* No pools to process */
+ } else if (are_all_pools(1, argv)) {
+ /* The first arg is a pool name */
+ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) {
+ /* ...and the rest are vdev names */
+ cb.cb_vdev_names = argv + 1;
+ cb.cb_vdev_names_count = argc - 1;
+ argc = 1; /* One pool to process */
+ } else {
+ fprintf(stderr, gettext("Expected either a list of "));
+ fprintf(stderr, gettext("pools, or list of vdevs in"));
+ fprintf(stderr, " \"%s\", ", argv[0]);
+ fprintf(stderr, gettext("but got:\n"));
+ error_list_unresolved_vdevs(argc - 1, argv + 1,
+ argv[0], &cb);
+ fprintf(stderr, "\n");
+ usage(B_FALSE);
+ return (1);
+ }
+ } else {
+ /*
+ * The args don't make sense. The first arg isn't a pool name,
+ * nor are all the args vdevs.
+ */
+ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n"));
+ fprintf(stderr, "\n");
+ return (1);
+ }
+
+ if (cb.cb_vdev_names_count != 0) {
+ /*
+ * If user specified vdevs, it implies verbose.
+ */
+ cb.cb_verbose = B_TRUE;
+ }
/*
* Construct the list of all interesting pools.
@@ -2926,19 +3997,56 @@ zpool_do_iostat(int argc, char **argv)
return (1);
}
+ if (histo && (queues || latency)) {
+ pool_list_free(list);
+ (void) fprintf(stderr,
+ gettext("-w isn't allowed with [-q|-l]\n"));
+ usage(B_FALSE);
+ return (1);
+ }
+
/*
* Enter the main iostat loop.
*/
cb.cb_list = list;
- cb.cb_verbose = verbose;
- if (guid)
- cb.cb_name_flags |= VDEV_NAME_GUID;
- if (follow_links)
- cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
- if (full_name)
- cb.cb_name_flags |= VDEV_NAME_PATH;
- cb.cb_iteration = 0;
- cb.cb_namewidth = 0;
+
+ if (histo) {
+ /*
+ * Histograms tables look out of place when you try to display
+ * them with the other stats, so make a rule that you can only
+ * print histograms by themselves.
+ */
+ cb.cb_flags = IOS_L_HISTO_M;
+ } else {
+ cb.cb_flags = IOS_DEFAULT_M;
+ if (latency)
+ cb.cb_flags |= IOS_LATENCY_M;
+ if (queues)
+ cb.cb_flags |= IOS_QUEUES_M;
+ }
+
+ /*
+ * See if the module supports all the stats we want to display.
+ */
+ unsupported_flags = cb.cb_flags & ~get_stat_flags(list);
+ if (unsupported_flags) {
+ uint64_t f;
+ int idx;
+ fprintf(stderr,
+ gettext("The loaded zfs module doesn't support:"));
+
+ /* for each bit set in unsupported_flags */
+ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ fprintf(stderr, " -%c", flag_to_arg[idx]);
+ }
+
+		fprintf(stderr, ". Try running a newer module.\n");
+ pool_list_free(list);
+
+ return (1);
+ }
+
for (;;) {
if ((npools = pool_list_count(list)) == 0)
@@ -2949,7 +4057,7 @@ zpool_do_iostat(int argc, char **argv)
* we skip any printing.
*/
boolean_t skip = (omit_since_boot &&
- cb.cb_iteration == 0);
+ cb.cb_iteration == 0);
/*
* Refresh all statistics. This is done as an
@@ -2958,7 +4066,7 @@ zpool_do_iostat(int argc, char **argv)
* properly accounted for.
*/
(void) pool_list_iter(list, B_FALSE, refresh_iostat,
- &cb);
+ &cb);
/*
* Iterate over all pools to determine the maximum width
@@ -2966,7 +4074,7 @@ zpool_do_iostat(int argc, char **argv)
*/
cb.cb_namewidth = 0;
(void) pool_list_iter(list, B_FALSE, get_namewidth,
- &cb);
+ &cb);
if (timestamp_fmt != NODATE)
print_timestamp(timestamp_fmt);
@@ -2974,28 +4082,38 @@ zpool_do_iostat(int argc, char **argv)
/*
* If it's the first time and we're not skipping it,
* or either skip or verbose mode, print the header.
+ *
+ * The histogram code explicitly prints its header on
+ * every vdev, so skip this for histograms.
*/
- if ((++cb.cb_iteration == 1 && !skip) ||
- (skip != verbose))
+ if (((++cb.cb_iteration == 1 && !skip) ||
+ (skip != verbose)) &&
+ (!(cb.cb_flags & IOS_L_HISTO_M)) &&
+ !cb.cb_scripted)
print_iostat_header(&cb);
if (skip) {
- (void) sleep(interval);
+ (void) fsleep(interval);
continue;
}
- (void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
+ pool_list_iter(list, B_FALSE, print_iostat, &cb);
/*
* If there's more than one pool, and we're not in
* verbose mode (which prints a separator for us),
* then print a separator.
+ *
+ * In addition, if we're printing specific vdevs then
+ * we also want an ending separator.
*/
- if (npools > 1 && !verbose)
+ if (((npools > 1 && !verbose &&
+ !(cb.cb_flags & IOS_L_HISTO_M)) ||
+ (!(cb.cb_flags & IOS_L_HISTO_M) &&
+ cb.cb_vdev_names_count)) &&
+ !cb.cb_scripted) {
print_iostat_separator(&cb);
-
- if (verbose)
- (void) printf("\n");
+ }
}
/*
@@ -3010,7 +4128,7 @@ zpool_do_iostat(int argc, char **argv)
if (count != 0 && --count == 0)
break;
- (void) sleep(interval);
+ (void) fsleep(interval);
}
pool_list_free(list);
@@ -3352,7 +4470,8 @@ zpool_do_list(int argc, char **argv)
"name,size,allocated,free,expandsize,fragmentation,capacity,"
"dedupratio,health,altroot";
char *props = default_props;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
zpool_list_t *list;
boolean_t first = B_TRUE;
@@ -3427,7 +4546,7 @@ zpool_do_list(int argc, char **argv)
break;
pool_list_free(list);
- (void) sleep(interval);
+ (void) fsleep(interval);
}
if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
@@ -4776,7 +5895,8 @@ zpool_do_status(int argc, char **argv)
{
int c;
int ret;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
status_cbdata_t cb = { 0 };
/* check options */
@@ -4841,7 +5961,7 @@ zpool_do_status(int argc, char **argv)
if (count != 0 && --count == 0)
break;
- (void) sleep(interval);
+ (void) fsleep(interval);
}
return (0);
diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c
index c7a002efb..df3f9bf83 100644
--- a/cmd/zpool/zpool_util.c
+++ b/cmd/zpool/zpool_util.c
@@ -29,6 +29,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
+#include <ctype.h>
#include "zpool_util.h"
@@ -84,3 +85,31 @@
}
return (nlogs);
}
+
+/* Find the max element in an array of uint64_t values */
+uint64_t
+array64_max(uint64_t array[], unsigned int len) {
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array[i]);
+
+ return (max);
+}
+
+/*
+ * Return 1 if "str" is a number string, 0 otherwise. Works for integer and
+ * floating point numbers.
+ */
+int
+isnumber(char *str) {
+	/* Reject empty strings, which would otherwise return 1 */
+	if (*str == '\0')
+		return (0);
+
+	for (; *str; str++)
+		if (!(isdigit(*str) || (*str == '.')))
+			return (0);
+
+	return (1);
+}
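
Example behavior of the helper above (inputs assumed):

	/*
	 * isnumber("5")     == 1
	 * isnumber("1.5")   == 1
	 * isnumber("tank")  == 0
	 * isnumber("1.2.3") == 1 -- callers such as get_interval_count()
	 * still reject such strings via strtof()'s end-pointer check.
	 */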
diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h
index 1b4ce518f..f279fd5dd 100644
--- a/cmd/zpool/zpool_util.h
+++ b/cmd/zpool/zpool_util.h
@@ -38,6 +38,8 @@ extern "C" {
void *safe_malloc(size_t);
void zpool_no_memory(void);
uint_t num_logs(nvlist_t *nv);
+uint64_t array64_max(uint64_t array[], unsigned int len);
+int isnumber(char *str);
/*
* Virtual device functions
@@ -55,6 +57,10 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
zpool_iter_f, void *);
+/* Vdev list functions */
+typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
+int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
+
typedef struct zpool_list zpool_list_t;
zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
diff --git a/include/libzfs.h b/include/libzfs.h
index 3faee0add..654b93284 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -747,10 +747,21 @@ extern int zfs_unshareall(zfs_handle_t *);
extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
void *, void *, int, zfs_share_op_t);
+enum zfs_nicenum_format {
+ ZFS_NICENUM_1024 = 0,
+ ZFS_NICENUM_TIME = 1,
+ ZFS_NICENUM_RAW = 2
+};
+
/*
* Utility function to convert a number to a human-readable form.
*/
extern void zfs_nicenum(uint64_t, char *, size_t);
+extern void zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
+ enum zfs_nicenum_format type);
+
+
+extern void zfs_nicetime(uint64_t, char *, size_t);
extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
/*
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e2974ad7a..65dba125c 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -32,6 +32,7 @@
#define _SYS_FS_ZFS_H
#include <sys/time.h>
+#include <sys/zio_priority.h>
#ifdef __cplusplus
extern "C" {
@@ -528,6 +529,37 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
+
+/* container nvlist of extended stats */
+#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex"
+
+/* Active queue read/write stats */
+#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
+#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
+
+/* Queue sizes */
+#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
+#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
+
+/* Latency read/write histogram stats */
+#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
+
+
+
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
@@ -766,9 +798,50 @@ typedef struct vdev_stat {
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
+
} vdev_stat_t;
/*
+ * Extended stats
+ *
+ * These are stats which aren't included in the original iostat output. For
+ * convenience, they are grouped together in vdev_stat_ex, although each stat
+ * is individually exported as a nvlist.
+ */
+typedef struct vdev_stat_ex {
+ /* Number of ZIOs issued to disk and waiting to finish */
+ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+ /* Number of ZIOs pending to be issued to disk */
+ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+ /*
+ * Below are the histograms for various latencies. Buckets are in
+ * units of nanoseconds.
+ */
+
+ /*
+ * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in
+ * before this.
+ */
+#define VDEV_HISTO_BUCKETS 37
+
+ /* Amount of time in ZIO queue (ns) */
+ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
+ [VDEV_HISTO_BUCKETS];
+
+ /* Total ZIO latency (ns). Includes queuing and disk access time */
+ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS];
+
+ /* Amount of time to read/write the disk (ns) */
+ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS];
+
+ /* "lookup the bucket for a value" macro */
+#define HISTO(a) (a != 0 ? MIN(highbit64(a) - 1, VDEV_HISTO_BUCKETS - 1) : 0)
+
+} vdev_stat_ex_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
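
To make the HISTO() bucket math above concrete, a worked example with an
assumed latency value:

	/*
	 * A 1000ns I/O lands in bucket
	 *   HISTO(1000) = MIN(highbit64(1000) - 1, 36) = 9,
	 * which spans [2^9, 2^10) = 512ns-1023ns. Anything at or above
	 * 2^37 ns clamps into the last bucket (36).
	 */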
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 5abd8c019..4f54b1707 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -85,7 +85,7 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
-
+extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
@@ -153,6 +153,7 @@ extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
extern int vdev_label_number(uint64_t psise, uint64_t offset);
extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
typedef enum {
VDEV_LABEL_CREATE, /* create/add a new device */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 4958cad9c..0d09c81c7 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -150,6 +150,7 @@ struct vdev {
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
vdev_stat_t vdev_stat; /* virtual device statistics */
+ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
boolean_t vdev_nonrot; /* true if solid state */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index e68223eb3..693035ee2 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -647,6 +647,7 @@ extern void delay(clock_t ticks);
extern uint64_t physmem;
extern int highbit64(uint64_t i);
+extern int lowbit64(uint64_t i);
extern int random_get_bytes(uint8_t *ptr, size_t len);
extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index ced7fe87b..9790b4a90 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -421,7 +421,8 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_delta; /* vdev queue service delta */
- uint64_t io_delay; /* vdev disk service delta (ticks) */
+ hrtime_t io_delay; /* Device access time (disk or */
+ /* file). */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index e33b9585b..3fc3589be 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -29,8 +29,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
-
- ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
+ ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;
#ifdef __cplusplus
diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/sys/sysmacros.h
index 5d10657be..c2525dd2a 100644
--- a/lib/libspl/include/sys/sysmacros.h
+++ b/lib/libspl/include/sys/sysmacros.h
@@ -39,6 +39,9 @@
#ifndef ABS
#define ABS(a) ((a) < 0 ? -(a) : (a))
#endif
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0]))
+#endif
#define makedevice(maj, min) makedev(maj, min)
#define _sysconf(a) sysconf(a)
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 8cacc01dd..789df407c 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -3538,7 +3538,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
* If it's a raidz device, we need to stick in the parity level.
*/
if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
-
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&value) == 0);
(void) snprintf(buf, sizeof (buf), "%s%llu", path,
@@ -3552,7 +3551,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
*/
if (name_flags & VDEV_NAME_TYPE_ID) {
uint64_t id;
-
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&id) == 0);
(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu",
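The two hunks above only drop blank lines, but the surrounding zpool_vdev_name()
logic is what builds names like "raidz2-3": type plus parity level, then "-"
plus the top-level vdev id. A tiny illustration with hypothetical values:

#include <stdio.h>

int
main(void)
{
    char buf[32], tmpbuf[40];
    unsigned long long nparity = 2, id = 3;    /* illustrative values */

    /* "raidz" + parity level, then "-" + top-level vdev id. */
    snprintf(buf, sizeof (buf), "%s%llu", "raidz", nparity);
    snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", buf, id);
    printf("%s\n", tmpbuf);     /* prints "raidz2-3" */
    return (0);
}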
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 57c2ac853..926ed4ed8 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -596,27 +596,49 @@ zfs_strdup(libzfs_handle_t *hdl, const char *str)
* Convert a number to an appropriately human-readable output.
*/
void
-zfs_nicenum(uint64_t num, char *buf, size_t buflen)
+zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
+ enum zfs_nicenum_format format)
{
uint64_t n = num;
int index = 0;
- char u;
+ const char *u;
+ const char *units[3][7] = {
+ [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"},
+ [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"}
+ };
+
+ const int units_len[] = {[ZFS_NICENUM_1024] = 6,
+ [ZFS_NICENUM_TIME] = 4};
+
+ const int k_unit[] = { [ZFS_NICENUM_1024] = 1024,
+ [ZFS_NICENUM_TIME] = 1000};
- while (n >= 1024 && index < 6) {
- n /= 1024;
+ double val;
+
+ if (format == ZFS_NICENUM_RAW) {
+ (void) snprintf(buf, buflen, "%llu", (u_longlong_t) num);
+ return;
+ }
+
+ while (n >= k_unit[format] && index < units_len[format]) {
+ n /= k_unit[format];
index++;
}
- u = " KMGTPE"[index];
+ u = units[format][index];
- if (index == 0) {
- (void) snprintf(buf, buflen, "%llu", (u_longlong_t) n);
- } else if ((num & ((1ULL << 10 * index) - 1)) == 0) {
+ /* Don't print 0ns times */
+ if ((format == ZFS_NICENUM_TIME) && (num == 0)) {
+ (void) snprintf(buf, buflen, "-");
+ } else if ((index == 0) || ((num %
+ (uint64_t) powl(k_unit[format], index)) == 0)) {
/*
* If this is an even multiple of the base, always display
* without any decimal precision.
*/
- (void) snprintf(buf, buflen, "%llu%c", (u_longlong_t) n, u);
+ (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t) n, u);
} else {
/*
* We want to choose a precision that reflects the best choice
@@ -629,13 +651,61 @@ zfs_nicenum(uint64_t num, char *buf, size_t buflen)
*/
int i;
for (i = 2; i >= 0; i--) {
- if (snprintf(buf, buflen, "%.*f%c", i,
- (double)num / (1ULL << 10 * index), u) <= 5)
- break;
+ val = (double) num /
+ (uint64_t) powl(k_unit[format], index);
+
+ /*
+ * Don't print floating point values for time. Note,
+ * we use floor() instead of round() here, since
+ * round can result in undesirable results. For
+ * example, if "num" is in the range of
+ * 999500-999999, it will print out "1000us". This
+ * doesn't happen if we use floor().
+ */
+ if (format == ZFS_NICENUM_TIME) {
+ if (snprintf(buf, buflen, "%u%s",
+ (unsigned int) floor(val), u) <= 5)
+ break;
+
+ } else {
+ if (snprintf(buf, buflen, "%.*f%s", i,
+ val, u) <= 5)
+ break;
+ }
}
}
}
+/*
+ * Convert a number to an appropriately human-readable output.
+ */
+void
+zfs_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+ zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024);
+}
+
+/*
+ * Convert a time to an appropriately human-readable output.
+ * @num: Time in nanoseconds
+ */
+void
+zfs_nicetime(uint64_t num, char *buf, size_t buflen)
+{
+ zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME);
+}
+
+/*
+ * Print out a raw number with correct column spacing
+ */
+void
+zfs_niceraw(uint64_t num, char *buf, size_t buflen)
+{
+ zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW);
+}
+
void
libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
{
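A minimal standalone sketch of the divide-until-small unit selection that the
new zfs_nicenum_format() implements (re-implemented here for illustration; this
is not the libzfs function itself, and it omits the precision-picking loop):

#include <stdio.h>
#include <stdint.h>
#include <math.h>       /* pow(); link with -lm */

static void
nicenum(uint64_t num, unsigned base, const char **units, int nunits,
    char *buf, size_t buflen)
{
    uint64_t n = num;
    int index = 0;

    /* Divide by the base until the value fits in one unit. */
    while (n >= base && index < nunits - 1) {
        n /= base;
        index++;
    }

    /* Even multiples of the base print without decimals. */
    if (index == 0 || (num % (uint64_t)pow(base, index)) == 0)
        (void) snprintf(buf, buflen, "%llu%s",
            (unsigned long long)n, units[index]);
    else
        (void) snprintf(buf, buflen, "%.2f%s",
            (double)num / pow(base, index), units[index]);
}

int
main(void)
{
    const char *b1024[] = { "", "K", "M", "G", "T", "P", "E" };
    const char *nsec[] = { "ns", "us", "ms", "s" };
    char buf[32];

    nicenum(1536, 1024, b1024, 7, buf, sizeof (buf));
    (void) printf("%s\n", buf);     /* prints "1.50K" */
    nicenum(1500000, 1000, nsec, 4, buf, sizeof (buf));
    (void) printf("%s\n", buf);     /* prints "1.50ms" */
    return (0);
}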
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 49d17ece3..3d85093e2 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1071,6 +1071,50 @@ highbit64(uint64_t i)
return (h);
}
+/*
+ * Find lowest one bit set.
+ * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
+ * This is basically a reimplementation of ffsll(), which is GNU specific.
+ */
+int
+lowbit64(uint64_t i)
+{
+ register int h = 64;
+ if (i == 0)
+ return (0);
+
+ if (i & 0x00000000ffffffffULL)
+ h -= 32;
+ else
+ i >>= 32;
+
+ if (i & 0x0000ffff)
+ h -= 16;
+ else
+ i >>= 16;
+
+ if (i & 0x00ff)
+ h -= 8;
+ else
+ i >>= 8;
+
+ if (i & 0x0f)
+ h -= 4;
+ else
+ i >>= 4;
+
+ if (i & 0x3)
+ h -= 2;
+ else
+ i >>= 2;
+
+ if (i & 0x1)
+ h -= 1;
+
+ return (h);
+}
+
static int random_fd = -1, urandom_fd = -1;
static int
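A quick standalone check of the lowbit64() binary search against a naive
reference loop (an illustrative test harness, not part of the patch):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Same binary-search reduction as the lowbit64() added above. */
static int
lowbit64(uint64_t i)
{
    int h = 64;

    if (i == 0)
        return (0);
    if (i & 0x00000000ffffffffULL)
        h -= 32;
    else
        i >>= 32;
    if (i & 0x0000ffff)
        h -= 16;
    else
        i >>= 16;
    if (i & 0x00ff)
        h -= 8;
    else
        i >>= 8;
    if (i & 0x0f)
        h -= 4;
    else
        i >>= 4;
    if (i & 0x3)
        h -= 2;
    else
        i >>= 2;
    if (i & 0x1)
        h -= 1;
    return (h);
}

/* Naive reference: 1-based position of the lowest set bit. */
static int
lowbit64_ref(uint64_t i)
{
    int b;

    for (b = 0; b < 64; b++)
        if (i & (1ULL << b))
            return (b + 1);
    return (0);
}

int
main(void)
{
    int b;

    assert(lowbit64(0) == 0);
    for (b = 0; b < 64; b++)
        assert(lowbit64(1ULL << b) == b + 1);
    assert(lowbit64(0xff00ULL) == lowbit64_ref(0xff00ULL));
    printf("lowbit64 matches reference\n");
    return (0);
}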
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index 231043d75..7a0748c03 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -67,7 +67,7 @@ static void
show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
{
vdev_stat_t *vs;
- vdev_stat_t v0 = { 0 };
+ vdev_stat_t *v0 = NULL;
uint64_t sec;
uint64_t is_log = 0;
nvlist_t **child;
@@ -76,6 +76,8 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
char *prefix = "";
+ v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL);
+
if (indent == 0 && desc != NULL) {
(void) printf(" "
" capacity operations bandwidth ---- errors ----\n");
@@ -91,7 +93,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) != 0)
- vs = &v0;
+ vs = v0;
sec = MAX(1, vs->vs_timestamp / NANOSEC);
@@ -114,6 +116,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
rops, wops, rbytes, wbytes, rerr, werr, cerr);
}
+ umem_free(v0, sizeof (*v0));
if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0)
return;
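The hunk above moves the zeroed fallback stats off the stack and onto the heap,
presumably because the stats structures grew with this patch. A minimal sketch
of the pattern, using calloc()/free() in place of the umem wrappers and a
made-up stats struct:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct stats {
    uint64_t ops[6];
    uint64_t bytes[6];
};

static int
lookup_stats(struct stats **out)    /* stand-in for the nvlist lookup */
{
    (void) out;
    return (-1);                    /* pretend the lookup failed */
}

int
main(void)
{
    struct stats *vs, *v0;

    /* Zeroed fallback lives on the heap, not in the stack frame. */
    v0 = calloc(1, sizeof (*v0));
    if (v0 == NULL)
        return (1);

    if (lookup_stats(&vs) != 0)
        vs = v0;                    /* report all-zero stats */

    printf("reads=%llu\n", (unsigned long long)vs->ops[0]);
    free(v0);
    return (0);
}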
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index bcbcaa249..1f14eee98 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -95,7 +95,9 @@ zpool \- configures ZFS storage pools
.LP
.nf
-\fBzpool iostat\fR [\fB-T\fR d | u ] [\fB-gLPvy\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]
+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]]
+ [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR
+
.fi
.LP
@@ -1677,11 +1679,22 @@ Scan using the default search path, the libblkid cache will not be consulted. A
.ne 2
.mk
.na
-\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-gLPvy\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR
+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR
+
.ad
.sp .6
.RS 4n
-Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for every pool in the system is shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
+Displays \fBI/O\fR statistics for the given \fIpool\fRs/\fIvdev\fRs. You can
+pass in a list of \fIpool\fRs, a \fIpool\fR and a list of \fIvdev\fRs in that
+\fIpool\fR, or a list of any \fIvdev\fRs from any \fIpool\fR. If no items are
+specified, statistics for every pool in the system are shown. When given an
+interval, the statistics are printed every \fIinterval\fR seconds until
+\fBCtrl-C\fR is pressed. If \fIcount\fR is specified, the command exits after
+\fIcount\fR reports are printed. The first report printed is always the
+statistics since boot regardless of whether \fIinterval\fR and \fIcount\fR
+are passed. However, this behavior can be suppressed with the \fB-y\fR flag.
+Also note that the units of 'K', 'M', 'G'... that are printed in the report
+are in base 1024. To get the raw values, use the \fB-p\fR flag.
.sp
.ne 2
.mk
@@ -1710,6 +1723,17 @@ Display vdev GUIDs instead of the normal device names. These GUIDs can be used i
.ne 2
.mk
.na
+\fB\fB-H\fR\fR
+.ad
+.RS 12n
+.rt
+Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fB-L\fR\fR
.ad
.RS 12n
@@ -1721,6 +1745,17 @@ Display real paths for vdevs resolving all symbolic links. This can be used to l
.ne 2
.mk
.na
+\fB\fB-p\fR\fR
+.ad
+.RS 12n
+.rt
+Display numbers in parseable (exact) values. Time values are in nanoseconds.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fB-P\fR\fR
.ad
.RS 12n
@@ -1749,9 +1784,177 @@ Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within t
.rt
Omit statistics since boot. Normally the first line of output reports the statistics since boot. This option suppresses that first line of output.
.RE
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-w\fR\fR
+.ad
+.RS 12n
+.rt
+Display latency histograms:
+
+.sp
+.ne 2
+.mk
+.na
+total_wait:
+.ad
+.RS 20n
+.rt
+Total IO time (queuing + disk IO time).
+.RE
+.ne 2
+.mk
+.na
+disk_wait:
+.ad
+.RS 20n
+.rt
+Disk IO time (time reading/writing the disk).
+.RE
+.ne 2
+.mk
+.na
+syncq_wait:
+.ad
+.RS 20n
+.rt
+Amount of time IO spent in synchronous priority queues. Does not include
+disk time.
+.RE
+.ne 2
+.mk
+.na
+asyncq_wait:
+.ad
+.RS 20n
+.rt
+Amount of time IO spent in asynchronous priority queues. Does not include
+disk time.
+.RE
+.ne 2
+.mk
+.na
+scrub:
+.ad
+.RS 20n
+.rt
+Amount of time IO spent in scrub queue. Does not include disk time.
+.RE
+
+All histogram buckets are power-of-two sized. The time labels are the end
+ranges of the buckets, so for example, a 15ns bucket stores latencies from
+8-15ns. The last bucket is also a catch-all for latencies higher than the
+maximum.
+.RE
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-l\fR\fR
+.ad
+.RS 12n
+.rt
+Include average latency statistics:
+
+.sp
+.ne 2
+.mk
+.na
+total_wait:
+.ad
+.RS 20n
+.rt
+Average total IO time (queuing + disk IO time).
+.RE
+.ne 2
+.mk
+.na
+disk_wait:
+.ad
+.RS 20n
+.rt
+Average disk IO time (time reading/writing the disk).
+.RE
+.ne 2
+.mk
+.na
+syncq_wait:
+.ad
+.RS 20n
+.rt
+Average amount of time IO spent in synchronous priority queues. Does not
+include disk time.
+.RE
+.ne 2
+.mk
+.na
+asyncq_wait:
+.ad
+.RS 20n
+.rt
+Average amount of time IO spent in asynchronous priority queues. Does not
+include disk time.
+.RE
+.ne 2
+.mk
+.na
+scrub:
+.ad
+.RS 20n
+.rt
+Average queuing time in scrub queue. Does not include disk time.
+.RE
.RE
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-q\fR\fR
+.ad
+.RS 12n
+.rt
+Include active queue statistics. Each priority queue has both pending ("pend")
+and active ("activ") IOs. Pending IOs are waiting to be issued to the disk, and
+active IOs have been issued to disk and are waiting for completion. These stats
+are broken out by priority queue:
+.sp
+.ne 2
+.mk
+.na
+syncq_read/write:
+.ad
+.RS 20n
+.rt
+Current number of entries in synchronous priority queues.
+.RE
+.ne 2
+.mk
+.na
+asyncq_read/write:
+.ad
+.RS 20n
+.rt
+Current number of entries in asynchronous priority queues.
+.RE
+.ne 2
+.mk
+.na
+scrubq_read:
+.ad
+.RS 20n
+.rt
+Current number of entries in scrub queue.
+.RE
+All queue statistics are instantaneous measurements of the number of entries
+in the queues. If you specify an interval, the measurements will be sampled
+from the end of the interval.
+.RE
.sp
.ne 2
.mk
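The -w description above says the histogram buckets are power-of-two sized,
with each label naming the end of its range (8-15ns share the "15ns" bucket).
A sketch of that bucketing in C; the bucket count and the exact in-kernel
HISTO macro are assumptions here, but the highbit64-style indexing matches the
documented ranges:

#include <stdio.h>
#include <stdint.h>

#define BUCKETS 37      /* hypothetical bucket count */

/* 1-based position of the highest set bit (0 for 0), like highbit64(). */
static int
highbit64(uint64_t i)
{
    int h = 0;

    while (i != 0) {
        h++;
        i >>= 1;
    }
    return (h);
}

/*
 * Power-of-two bucketing: latencies 8-15ns land in the "15ns" bucket,
 * 16-31ns in "31ns", and so on; the last bucket is a catch-all.
 */
static int
histo_bucket(uint64_t ns)
{
    int b = highbit64(ns);

    return (b >= BUCKETS ? BUCKETS - 1 : b);
}

int
main(void)
{
    printf("%d %d %d\n", histo_bucket(8), histo_bucket(15),
        histo_bucket(16));      /* 4 4 5: 8ns and 15ns share a bucket */
    return (0);
}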
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 9246495ee..c23fd7a3a 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -3350,6 +3350,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
== 0);
vdev_get_stats(vd, vs);
+ vdev_config_generate_stats(vd, l2cache[i]);
}
}
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 7114c2efc..137390173 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2764,50 +2764,124 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
return (B_TRUE);
}
-/*
- * Get statistics for the given vdev.
- */
-void
-vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+static void
+vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int c, t;
+ int t;
+ for (t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ cvs->vs_scan_removing = cvd->vdev_removing;
+}
- mutex_enter(&vd->vdev_stat_lock);
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
- vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
- vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
- if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
- vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+/*
+ * Get extended stats
+ */
+static void
+vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
+{
+ int t, b;
+ for (t = 0; t < ZIO_TYPES; t++) {
+ for (b = 0; b < VDEV_HISTO_BUCKETS; b++) {
+ vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
+ vsx->vsx_total_histo[t][b] +=
+ cvsx->vsx_total_histo[t][b];
+ }
+ }
+
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ for (b = 0; b < VDEV_HISTO_BUCKETS; b++) {
+ vsx->vsx_queue_histo[t][b] +=
+ cvsx->vsx_queue_histo[t][b];
+ }
+ vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
+ vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
}
+}
+/*
+ * Get statistics for the given vdev.
+ */
+static void
+vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ int c, t;
/*
* If we're getting stats on the root vdev, aggregate the I/O counts
* over all top-level vdevs (i.e. the direct children of the root).
*/
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ if (vs) {
+ memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
+ memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
+ }
+ if (vsx)
+ memset(vsx, 0, sizeof (*vsx));
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;
+ vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
- for (t = 0; t < ZIO_TYPES; t++) {
- vs->vs_ops[t] += cvs->vs_ops[t];
- vs->vs_bytes[t] += cvs->vs_bytes[t];
- }
- cvs->vs_scan_removing = cvd->vdev_removing;
+ vdev_get_stats_ex_impl(cvd, cvs, cvsx);
+ if (vs)
+ vdev_get_child_stat(cvd, vs, cvs);
+ if (vsx)
+ vdev_get_child_stat_ex(cvd, vsx, cvsx);
+ }
+ } else {
+ /*
+ * We're a leaf. Just copy our ZIO active queue stats in. The
+ * other leaf stats are updated in vdev_stat_update().
+ */
+ if (!vsx)
+ return;
+
+ memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
+
+ for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
+ vsx->vsx_active_queue[t] =
+ vd->vdev_queue.vq_class[t].vqc_active;
+ vsx->vsx_pend_queue[t] = avl_numnodes(
+ &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ }
+ }
+}
+
+void
+vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ if (vs) {
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_rsize += VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE;
+ vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ !vd->vdev_ishole) {
+ vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
}
}
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0);
+ vdev_get_stats_ex_impl(vd, vs, vsx);
mutex_exit(&vd->vdev_stat_lock);
}
void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_get_stats_ex(vd, vs, NULL);
+}
+
+void
vdev_clear_stats(vdev_t *vd)
{
mutex_enter(&vd->vdev_stat_lock);
@@ -2840,6 +2914,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_t *pvd;
uint64_t txg = zio->io_txg;
vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
@@ -2890,8 +2965,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vs->vs_self_healed += psize;
}
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += psize;
+ /*
+ * The bytes/ops/histograms are recorded at the leaf level and
+ * aggregated into the higher level vdevs in vdev_get_stats().
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += psize;
+
+ if (zio->io_delta && zio->io_delay) {
+ vsx->vsx_queue_histo[zio->io_priority]
+ [HISTO(zio->io_delta - zio->io_delay)]++;
+ vsx->vsx_disk_histo[type]
+ [HISTO(zio->io_delay)]++;
+ vsx->vsx_total_histo[type]
+ [HISTO(zio->io_delta)]++;
+ }
+ }
mutex_exit(&vd->vdev_stat_lock);
return;
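The split above follows a common pattern: leaves own the counters (updated in
vdev_stat_update()), and interior vdevs sum their children recursively on
read. A stripped-down sketch of that aggregation with simplified types, not
the kernel code:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NTYPES 2        /* read, write */

struct vdev {
    int nchildren;
    struct vdev **child;
    uint64_t ops[NTYPES];       /* meaningful only on leaves */
};

/* Leaves copy their counters out; interior nodes sum their children. */
static void
get_stats(struct vdev *vd, uint64_t ops[NTYPES])
{
    int c, t;

    if (vd->nchildren == 0) {
        memcpy(ops, vd->ops, sizeof (vd->ops));
        return;
    }
    memset(ops, 0, NTYPES * sizeof (uint64_t));
    for (c = 0; c < vd->nchildren; c++) {
        uint64_t cops[NTYPES];

        get_stats(vd->child[c], cops);
        for (t = 0; t < NTYPES; t++)
            ops[t] += cops[t];
    }
}

int
main(void)
{
    struct vdev leaf1 = { 0, NULL, { 10, 5 } };
    struct vdev leaf2 = { 0, NULL, { 20, 7 } };
    struct vdev *kids[] = { &leaf1, &leaf2 };
    struct vdev root = { 2, kids, { 0, 0 } };
    uint64_t ops[NTYPES];

    get_stats(&root, ops);
    printf("reads=%llu writes=%llu\n",
        (unsigned long long)ops[0], (unsigned long long)ops[1]);
    return (0);
}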
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 9b51ecc1d..4e362226a 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -100,9 +100,9 @@ vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
- "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
+ "flags=%x\n", zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
- zio->io_flags, (u_longlong_t)zio->io_delay);
+ zio->io_flags);
#endif
}
@@ -410,7 +410,6 @@ vdev_disk_dio_put(dio_request_t *dr)
vdev_disk_dio_free(dr);
if (zio) {
- zio->io_delay = jiffies_64 - zio->io_delay;
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
@@ -588,8 +587,6 @@ retry:
/* Extra reference to protect dio_request during vdev_submit_bio */
vdev_disk_dio_get(dr);
- if (zio)
- zio->io_delay = jiffies_64;
/* Submit all bio's associated with this dio */
for (i = 0; i < dr->dr_bio_count; i++)
@@ -630,7 +627,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
int rc = bio->bi_error;
#endif
- zio->io_delay = jiffies_64 - zio->io_delay;
zio->io_error = -rc;
if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;
@@ -660,7 +656,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
bio->bi_end_io = vdev_disk_io_flush_completion;
bio->bi_private = zio;
bio->bi_bdev = bdev;
- zio->io_delay = jiffies_64;
vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
invalidate_bdev(bdev);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 3dc3d0d9d..1400aee7b 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -208,6 +208,107 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
}
/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+ nvlist_t *nvx;
+ vdev_stat_t *vs;
+ vdev_stat_ex_t *vsx;
+
+ vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+ vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+ vdev_get_stats_ex(vd, vs, vsx);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+ kmem_free(vs, sizeof (*vs));
+
+ /*
+ * Add extended stats into a special extended stats nvlist. This keeps
+ * all the extended stats nicely grouped together. The extended stats
+ * nvlist is then added to the main nvlist.
+ */
+ nvx = fnvlist_alloc();
+
+ /* ZIOs in flight to disk */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
+
+ /* ZIOs pending */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
+
+ /* Histograms */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
+
+ /* Add extended stats nvlist to main nvlist */
+ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
+
+ kmem_free(vsx, sizeof (*vsx));
+}
+
+/*
* Generate the nvlist representing this vdev's config.
*/
nvlist_t *
@@ -215,7 +316,6 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
-
nv = fnvlist_alloc();
fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
@@ -306,12 +406,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
}
if (getstats) {
- vdev_stat_t vs;
pool_scan_stat_t ps;
- vdev_get_stats(vd, &vs);
- fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
+ vdev_config_generate_stats(vd, nv);
/* provide either current or previous scan information */
if (spa_scan_get_stats(spa, &ps) == 0) {
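On the consumer side, the grouped extended-stats nvlist built by
vdev_config_generate_stats() can be unpacked with the stock libnvpair lookups.
A hedged sketch of a reader, assuming the caller has already obtained a single
vdev's config nvlist (for example via zpool_get_config() and walking the vdev
tree); the key macros are the ones used in this patch:

#include <sys/fs/zfs.h>     /* ZPOOL_CONFIG_* keys used in this patch */
#include <libnvpair.h>
#include <stdio.h>

/* Print the non-empty buckets of one vdev's total-read latency histogram. */
static void
print_read_latency_histo(nvlist_t *nv)
{
    nvlist_t *nvx;
    uint64_t *histo;
    uint_t buckets, b;

    /* Extended stats are grouped under their own nvlist. */
    if (nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0)
        return;     /* kernel without this patch */

    if (nvlist_lookup_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
        &histo, &buckets) != 0)
        return;

    for (b = 0; b < buckets; b++)
        if (histo[b] != 0)
            printf("bucket %u: %llu IOs\n", b,
                (unsigned long long)histo[b]);
}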
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 2d16e632d..523a924d6 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -39,6 +39,7 @@
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/time.h>
/*
* ==========================================================================
@@ -2694,6 +2695,8 @@ zio_vdev_io_start(zio_t *zio)
uint64_t align;
spa_t *spa = zio->io_spa;
+ zio->io_delay = 0;
+
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -2799,6 +2802,7 @@ zio_vdev_io_start(zio_t *zio)
}
}
+ zio->io_delay = gethrtime();
vd->vdev_ops->vdev_op_io_start(zio);
return (ZIO_PIPELINE_STOP);
}
@@ -2815,6 +2819,9 @@ zio_vdev_io_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ if (zio->io_delay)
+ zio->io_delay = gethrtime() - zio->io_delay;
+
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
vdev_queue_io_done(zio);
@@ -3217,7 +3224,7 @@ zio_done(zio_t *zio)
* 30 seconds to complete, post an error describing the I/O delay.
* We ignore these errors if the device is currently unavailable.
*/
- if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
+ if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
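The two timestamps above yield two durations: io_delay now covers only device
service time (start of vdev_op_io_start to zio_vdev_io_done), while io_delta,
set by the vdev queue, covers queuing plus service, so queue wait is their
difference. A userland sketch of the same pattern, with clock_gettime() as a
stand-in for gethrtime():

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Monotonic nanoseconds, a userland stand-in for gethrtime(). */
static uint64_t
now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
}

int
main(void)
{
    uint64_t queued, issued, done;

    queued = now_ns();      /* I/O enters the queue (io_timestamp) */
    usleep(2000);           /* pretend to wait in the queue */
    issued = now_ns();      /* handed to the device (io_delay start) */
    usleep(5000);           /* pretend the device is working */
    done = now_ns();

    /* io_delay: device time; io_delta: queue + device time. */
    printf("disk_wait  = %llu ns\n", (unsigned long long)(done - issued));
    printf("total_wait = %llu ns\n", (unsigned long long)(done - queued));
    printf("queue_wait = %llu ns\n", (unsigned long long)(issued - queued));
    return (0);
}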
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 179f82e43..c9b882987 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -370,7 +370,7 @@ tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos',
[tests/functional/cli_user/zpool_iostat]
tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
- 'zpool_iostat_003_neg']
+ 'zpool_iostat_003_neg', 'zpool_iostat_004_pos']
[tests/functional/cli_user/zpool_list]
tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am
index 2c292b999..621dff91f 100644
--- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am
@@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \
cleanup.ksh \
zpool_iostat_001_neg.ksh \
zpool_iostat_002_pos.ksh \
- zpool_iostat_003_neg.ksh
+ zpool_iostat_003_neg.ksh \
+ zpool_iostat_004_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
index d275e063b..77eb6bd34 100755
--- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
@@ -33,4 +33,4 @@
DISK=${DISKS%% *}
-default_setup $DISK
+default_raidz_setup $DISKS
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
index 37062ca53..ec5599ace 100755
--- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
@@ -33,13 +33,13 @@
#
# DESCRIPTION:
-# Verify that 'zpool iostat [interval [count]' can be executed as non-root.
+# Verify that 'zpool iostat [interval [count]]' can be executed as non-root.
#
# STRATEGY:
# 1. Set the interval to 1 and count to 4.
# 2. Sleep for 4 seconds.
# 3. Verify that the output has 4 records.
-#
+# 4. Set interval to 0.5 and count to 1 to test floating point intervals.
+#
verify_runnable "both"
@@ -68,4 +68,7 @@ if [[ $stat_count -ne 4 ]]; then
log_fail "zpool iostat [pool_name] [interval] [count] failed"
fi
+# Test a floating point interval value
+log_must $ZPOOL iostat -v 0.5 1
+
log_pass "zpool iostat [pool_name ...] [interval] [count] passed"
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
index d73f5d5c8..ae1e5a152 100755
--- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
@@ -51,13 +51,14 @@ else
fi
set -A args "" "-?" "-f" "nonexistpool" "$TESTPOOL/$TESTFS" \
- "$testpool 1.23" "$testpool 0" "$testpool -1" "$testpool 1 0" \
- "$testpool 0 0"
+ "$testpool 0" "$testpool -1" "$testpool 1 0" \
+ "$testpool 0 0" "$testpool -wl" "$testpool -wq"
log_assert "Executing 'zpool iostat' with bad options fails"
typeset -i i=1
while [[ $i -lt ${#args[*]} ]]; do
+ log_note "doing $ZPOOL iostat ${args[i]}"
log_mustnot $ZPOOL iostat ${args[i]}
((i = i + 1))
done
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
new file mode 100755
index 000000000..70318dbb9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+# Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Executing 'zpool iostat' command with various combinations of extended
+# stats options (-v, -q, -l, -w), parseable/script options (-p, -H), and
+# misc lists of pools and vdevs.
+#
+# STRATEGY:
+# 1. Create an array of mixed 'zpool iostat' options.
+# 2. Execute each element of the array.
+# 3. Verify each command succeeds (exits zero).
+#
+
+verify_runnable "both"
+
+typeset testpool
+if is_global_zone ; then
+ testpool=$TESTPOOL
+else
+ testpool=${TESTPOOL%%/*}
+fi
+
+set -A args "" "-v" "-q" "-l" "-lq $TESTPOOL" "-ql ${DISKS[0]} ${DISKS[1]}" \
+ "-w $TESTPOOL ${DISKS[0]} ${DISKS[1]}" \
+ "-wp $TESTPOOL" \
+ "-qlH $TESTPOOL ${DISKS[0]}" \
+ "-vpH ${DISKS[0]}" \
+ "-wpH ${DISKS[0]}"
+
+log_assert "Executing 'zpool iostat' with extended stat options succeeds"
+log_note "testpool: $TESTPOOL, disks $DISKS"
+
+typeset -i i=1
+while [[ $i -lt ${#args[*]} ]]; do
+ log_note "doing $ZPOOL iostat ${args[i]}"
+ log_must $ZPOOL iostat ${args[i]}
+ ((i = i + 1))
+done
+
+log_pass "Executing 'zpool iostat' with extended stat options succeeds"