29 files changed, 2074 insertions, 178 deletions
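The centerpiece of this change is a new vdev iterator added in cmd/zpool/zpool_iter.c below: callers supply a pool_vdev_iter_f callback that is invoked once per non-root, non-hole vdev. A minimal hypothetical callback (not part of this diff; count_leaves_cb is an illustrative name) looks like:

	/* Hypothetical example: count the leaf vdevs in a pool */
	static int
	count_leaves_cb(zpool_handle_t *zhp, nvlist_t *nv, void *data)
	{
		unsigned int *count = data;
		nvlist_t **child;
		uint_t children;

		/* Leaf vdevs have no ZPOOL_CONFIG_CHILDREN array */
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			(*count)++;

		return (0);
	}

	/* Usage: for_each_vdev(zhp, count_leaves_cb, &count); */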
diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
index c11951b22..b4ff106e1 100644
--- a/cmd/zpool/Makefile.am
+++ b/cmd/zpool/Makefile.am
@@ -19,4 +19,4 @@ zpool_LDADD = \
 	$(top_builddir)/lib/libzpool/libzpool.la \
 	$(top_builddir)/lib/libzfs/libzfs.la \
 	$(top_builddir)/lib/libzfs_core/libzfs_core.la \
-	$(LIBBLKID)
+	-lm $(LIBBLKID)
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
index 952d19172..a18ccf29d 100644
--- a/cmd/zpool/zpool_iter.c
+++ b/cmd/zpool/zpool_iter.c
@@ -250,3 +250,69 @@ for_each_pool(int argc, char **argv, boolean_t unavail,
 
 	return (ret);
 }
+
+static int
+for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func,
+    void *data)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	int ret = 0;
+	int i;
+	char *type;
+
+	const char *list[] = {
+	    ZPOOL_CONFIG_SPARES,
+	    ZPOOL_CONFIG_L2CACHE,
+	    ZPOOL_CONFIG_CHILDREN
+	};
+
+	for (i = 0; i < ARRAY_SIZE(list); i++) {
+		if (nvlist_lookup_nvlist_array(nv, list[i], &child,
+		    &children) == 0) {
+			for (c = 0; c < children; c++) {
+				uint64_t ishole = 0;
+
+				(void) nvlist_lookup_uint64(child[c],
+				    ZPOOL_CONFIG_IS_HOLE, &ishole);
+
+				if (ishole)
+					continue;
+
+				ret |= for_each_vdev_cb(zhp, child[c], func,
+				    data);
+			}
+		}
+	}
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (ret);
+
+	/* Don't run our function on root vdevs */
+	if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
+		ret |= func(zhp, nv, data);
+	}
+
+	return (ret);
+}
+
+/*
+ * This is the equivalent of for_each_pool() for vdevs.  It iterates through
+ * all vdevs in the pool, ignoring root vdevs and holes, calling func() on
+ * each one.
+ *
+ * @zhp:	Zpool handle
+ * @func:	Function to call on each vdev
+ * @data:	Custom data to pass to the function
+ */
+int
+for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data)
+{
+	nvlist_t *config, *nvroot = NULL;
+
+	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+	}
+	return (for_each_vdev_cb(zhp, nvroot, func, data));
+}
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9c7e2a0c4..6412a8e93 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -51,6 +51,7 @@
 #include <sys/fm/util.h>
 #include <sys/fm/protocol.h>
 #include <sys/zfs_ioctl.h>
+#include <math.h>
 
 #include <libzfs.h>
 
@@ -144,6 +145,23 @@ typedef enum {
 } zpool_help_t;
 
+/*
+ * Flags for stats to display with "zpool iostat"
+ */
+enum iostat_type {
+	IOS_DEFAULT = 0,
+	IOS_LATENCY = 1,
+	IOS_QUEUES = 2,
+	IOS_L_HISTO = 3,
+	IOS_COUNT,	/* always last element */
+};
+
+/* iostat_type entries as bitmasks */
+#define	IOS_DEFAULT_M	(1ULL << IOS_DEFAULT)
+#define	IOS_LATENCY_M	(1ULL << IOS_LATENCY)
+#define	IOS_QUEUES_M	(1ULL << IOS_QUEUES)
+#define	IOS_L_HISTO_M	(1ULL << IOS_L_HISTO)
+
 typedef struct zpool_command {
 	const char	*name;
 	int		(*func)(int, char **);
@@ -196,7 +214,7 @@ static zpool_command_t command_table[] = {
 	{ "set",	zpool_do_set,		HELP_SET		},
 };
 
-#define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
+#define	NCOMMAND	(ARRAY_SIZE(command_table))
 
 static zpool_command_t *current_command;
 static char history_str[HIS_MAX_RECORD_LEN];
@@ -237,7 +255,8 @@ get_usage(zpool_help_t idx) {
 		    "[-R root] [-F [-n]]\n"
 		    "\t    <pool | id> [newpool]\n"));
 	case HELP_IOSTAT:
-		return (gettext("\tiostat [-gLPvy] [-T d|u] [pool] ... "
+		return (gettext("\tiostat [-T d | u] [-ghHLpPvy] [[-lq]|-w]\n"
+		    "\t    [[pool ...]|[pool vdev ...]|[vdev ...]] "
 		    "[interval [count]]\n"));
 	case HELP_LABELCLEAR:
 		return (gettext("\tlabelclear [-f] <vdev>\n"));
@@ -2481,61 +2500,690 @@ error:
 }
 
 typedef struct iostat_cbdata {
-	boolean_t cb_verbose;
+	uint64_t cb_flags;
 	int cb_name_flags;
 	int cb_namewidth;
 	int cb_iteration;
+	char **cb_vdev_names; /* Only show these vdevs */
+	unsigned int cb_vdev_names_count;
+	boolean_t cb_verbose;
+	boolean_t cb_literal;
+	boolean_t cb_scripted;
 	zpool_list_t *cb_list;
 } iostat_cbdata_t;
 
+/* iostat labels */
+typedef struct name_and_columns {
+	const char *name;	/* Column name */
+	unsigned int columns;	/* Center name to this number of columns */
+} name_and_columns_t;
+
+#define	IOSTAT_MAX_LABELS	11	/* Max number of labels on one line */
+
+static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
+{
+	[IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
+	    {NULL}},
+	[IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+	    {"asyncq_wait", 2}, {"scrub"}, {NULL}},
+	[IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
+	    {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
+	    {NULL}},
+	[IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2},
+	    {"sync_queue", 2}, {"async_queue", 2}, {NULL}},
+};
+
+/* Shorthand - if "columns" field not set, default to 1 column */
+static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
+{
+	[IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {NULL}},
+	[IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}},
+	[IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
+	    {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}},
+	[IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}},
+};
+
+/*
+ * Return the number of labels in a null-terminated name_and_columns_t
+ * array.
+ */
+static unsigned int
+label_array_len(const name_and_columns_t *labels)
+{
+	int i = 0;
+
+	while (labels[i].name)
+		i++;
+
+	return (i);
+}
+
+/*
+ * Return a default column width for default/latency/queue columns.  This does
+ * not include histograms, which have their columns autosized.
+ */
+static unsigned int
+default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
+{
+	unsigned long column_width = 5; /* Normal niceprint */
+	static unsigned long widths[] = {
+		/*
+		 * Choose some sane default column sizes for printing the
+		 * raw numbers.
+		 */
+		[IOS_DEFAULT] = 15, /* 1PB capacity */
+		[IOS_LATENCY] = 10, /* 10 digits => up to ~10sec in ns */
+		[IOS_QUEUES] = 6,   /* 1M queue entries */
+	};
+
+	if (cb->cb_literal)
+		column_width = widths[type];
+
+	return (column_width);
+}
+
+/*
+ * Print the column labels, i.e.:
+ *
+ *   capacity     operations     bandwidth
+ * alloc   free   read  write   read  write  ...
+ *
+ * If force_column_width is set, use it for the column width.  If not set, use
+ * the default column width.
+ */ +void +print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, + const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) +{ + int i, idx, s; + unsigned int text_start, rw_column_width, spaces_to_end; + uint64_t flags = cb->cb_flags; + uint64_t f; + unsigned int column_width = force_column_width; + + /* For each bit set in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + if (!force_column_width) + column_width = default_column_width(cb, idx); + /* Print our top labels centered over "read write" label. */ + for (i = 0; i < label_array_len(labels[idx]); i++) { + const char *name = labels[idx][i].name; + /* + * We treat labels[][].columns == 0 as shorthand + * for one column. It makes writing out the label + * tables more concise. + */ + unsigned int columns = MAX(1, labels[idx][i].columns); + unsigned int slen = strlen(name); + + rw_column_width = (column_width * columns) + + (2 * (columns - 1)); + + text_start = (int) ((rw_column_width)/columns - + slen/columns); + + printf(" "); /* Two spaces between columns */ + + /* Space from beginning of column to label */ + for (s = 0; s < text_start; s++) + printf(" "); + + printf("%s", name); + + /* Print space after label to end of column */ + spaces_to_end = rw_column_width - text_start - slen; + for (s = 0; s < spaces_to_end; s++) + printf(" "); + + } + } + printf("\n"); +} + +/* + * Utility function to print out a line of dashes like: + * + * -------------------------------- ----- ----- ----- ----- ----- + * + * ...or a dashed named-row line like: + * + * logs - - - - - + * + * @cb: iostat data + * + * @force_column_width If non-zero, use the value as the column width. + * Otherwise use the default column widths. + * + * @name: Print a dashed named-row line starting + * with @name. Otherwise, print a regular + * dashed line. 
+ */ +static void +print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *name) +{ + int i; + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + uint64_t f; + int idx; + const name_and_columns_t *labels; + + if (cb->cb_flags & IOS_L_HISTO_M) + namewidth = MAX(cb->cb_namewidth, strlen("latency")); + else + namewidth = cb->cb_namewidth; + + if (name) { + namewidth = MAX(cb->cb_namewidth, strlen(name)); + printf("%-*s", namewidth, name); + } else { + for (i = 0; i < namewidth; i++) + (void) printf("-"); + } + + /* For each bit in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + unsigned int column_width; + idx = lowbit64(f) - 1; + if (force_column_width) + column_width = force_column_width; + else + column_width = default_column_width(cb, idx); + + labels = iostat_bottom_labels[idx]; + for (i = 0; i < label_array_len(labels); i++) { + if (name) + printf(" %*s-", column_width - 1, " "); + else + printf(" %.*s", column_width, + "--------------------"); + } + } + printf("\n"); +} + + +static void +print_iostat_separator_impl(iostat_cbdata_t *cb, + unsigned int force_column_width) +{ + print_iostat_dashes(cb, force_column_width, NULL); +} + static void print_iostat_separator(iostat_cbdata_t *cb) { - int i = 0; + print_iostat_separator_impl(cb, 0); +} + +static void +print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *histo_vdev_name) +{ + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + + if (flags & IOS_L_HISTO_M) + namewidth = MAX(cb->cb_namewidth, strlen("latency")); + else + namewidth = cb->cb_namewidth; + + if (flags & IOS_L_HISTO_M) + printf("%-*s", namewidth, histo_vdev_name); + else + printf("%*s", namewidth, ""); - for (i = 0; i < cb->cb_namewidth; i++) - (void) printf("-"); - (void) printf(" ----- ----- ----- ----- ----- -----\n"); + print_iostat_labels(cb, force_column_width, iostat_top_labels); + + printf("%-*s", namewidth, flags & IOS_L_HISTO_M ? "latency" : + cb->cb_vdev_names_count ? "vdev" : "pool"); + + print_iostat_labels(cb, force_column_width, iostat_bottom_labels); + + print_iostat_separator_impl(cb, force_column_width); } static void print_iostat_header(iostat_cbdata_t *cb) { - (void) printf("%*s capacity operations bandwidth\n", - cb->cb_namewidth, ""); - (void) printf("%-*s alloc free read write read write\n", - cb->cb_namewidth, "pool"); - print_iostat_separator(cb); + print_iostat_header_impl(cb, 0, NULL); } + /* * Display a single statistic. */ static void -print_one_stat(uint64_t value) +print_one_stat(uint64_t value, enum zfs_nicenum_format format, + unsigned int column_size, boolean_t scripted) { char buf[64]; - zfs_nicenum(value, buf, sizeof (buf)); - (void) printf(" %5s", buf); + zfs_nicenum_format(value, buf, sizeof (buf), format); + + if (scripted) + printf("\t%s", buf); + else + printf(" %*s", column_size, buf); +} + +/* + * Calculate the default vdev stats + * + * Subtract oldvs from newvs, apply a scaling factor, and save the resulting + * stats into calcvs. + */ +static void +calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, + vdev_stat_t *calcvs) +{ + int i; + + memcpy(calcvs, newvs, sizeof (*calcvs)); + for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) + calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); + + for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) + calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); +} + +/* + * Internal representation of the extended iostats data. 
+ * + * The extended iostat stats are exported in nvlists as either uint64_t arrays + * or single uint64_t's. We make both look like arrays to make them easier + * to process. In order to make single uint64_t's look like arrays, we set + * __data to the stat data, and then set *data = &__data with count = 1. Then, + * we can just use *data and count. + */ +struct stat_array { + uint64_t *data; + uint_t count; /* Number of entries in data[] */ + uint64_t __data; /* Only used when data is a single uint64_t */ +}; + +static uint64_t +stat_histo_max(struct stat_array *nva, unsigned int len) { + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array64_max(nva[i].data, nva[i].count)); + + return (max); +} + +/* + * Helper function to lookup a uint64_t array or uint64_t value and store its + * data as a stat_array. If the nvpair is a single uint64_t value, then we make + * it look like a one element array to make it easier to process. + */ +static int +nvpair64_to_stat_array(nvlist_t *nvl, const char *name, + struct stat_array *nva) { + nvpair_t *tmp; + int ret; + + verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); + switch (nvpair_type(tmp)) { + case DATA_TYPE_UINT64_ARRAY: + ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); + break; + case DATA_TYPE_UINT64: + ret = nvpair_value_uint64(tmp, &nva->__data); + nva->data = &nva->__data; + nva->count = 1; + break; + default: + /* Not a uint64_t */ + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * Given a list of nvlist names, look up the extended stats in newnv and oldnv, + * subtract them, and return the results in a newly allocated stat_array. + * You must free the returned array after you are done with it with + * free_calc_stats(). + * + * Additionally, you can set "oldnv" to NULL if you simply want the newnv + * values. 
+ */
+static struct stat_array *
+calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv,
+    nvlist_t *newnv)
+{
+	nvlist_t *oldnvx = NULL, *newnvx;
+	struct stat_array *oldnva, *newnva, *calcnva;
+	int i, j;
+	unsigned int alloc_size = (sizeof (struct stat_array)) * len;
+
+	/* Extract our extended stats nvlist from the main list */
+	verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+	    &newnvx) == 0);
+	if (oldnv) {
+		verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+		    &oldnvx) == 0);
+	}
+
+	newnva = safe_malloc(alloc_size);
+	oldnva = safe_malloc(alloc_size);
+	calcnva = safe_malloc(alloc_size);
+
+	for (j = 0; j < len; j++) {
+		verify(nvpair64_to_stat_array(newnvx, names[j],
+		    &newnva[j]) == 0);
+		calcnva[j].count = newnva[j].count;
+		alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]);
+		calcnva[j].data = safe_malloc(alloc_size);
+		memcpy(calcnva[j].data, newnva[j].data, alloc_size);
+
+		if (oldnvx) {
+			verify(nvpair64_to_stat_array(oldnvx, names[j],
+			    &oldnva[j]) == 0);
+			for (i = 0; i < oldnva[j].count; i++)
+				calcnva[j].data[i] -= oldnva[j].data[i];
+		}
+	}
+	free(newnva);
+	free(oldnva);
+	return (calcnva);
+}
+
+static void
+free_calc_stats(struct stat_array *nva, unsigned int len)
+{
+	int i;
+	for (i = 0; i < len; i++)
+		free(nva[i].data);
+
+	free(nva);
+}
+
+static void
+print_iostat_histo(struct stat_array *nva, unsigned int len,
+    iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth,
+    double scale)
+{
+	int i, j;
+	char buf[6];
+	uint64_t val;
+	enum zfs_nicenum_format format;
+	unsigned int buckets;
+
+	if (cb->cb_literal)
+		format = ZFS_NICENUM_RAW;
+	else
+		format = ZFS_NICENUM_1024;
+
+	/* All these histos are the same size, so just use nva[0].count */
+	buckets = nva[0].count;
+
+	for (j = 0; j < buckets; j++) {
+		/* Ending range of this bucket (1ULL: buckets exceed 32 bits) */
+		val = (1ULL << (j + 1)) - 1;
+
+		/* Print histogram bucket label */
+		zfs_nicetime(val, buf, sizeof (buf));
+		if (cb->cb_scripted)
+			printf("%llu", (u_longlong_t) val);
+		else
+			printf("%-*s", namewidth, buf);
+
+		/* Print the values on the line */
+		for (i = 0; i < len; i++) {
+			print_one_stat(nva[i].data[j] * scale, format,
+			    column_width, cb->cb_scripted);
+		}
+		printf("\n");
+	}
+}
+
+static void
+print_solid_separator(unsigned int length)
+{
+	while (length--)
+		printf("-");
+	printf("\n");
+}
+
+static void
+print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv, double scale, const char *name)
+{
+	unsigned int column_width;
+	unsigned int namewidth;
+	unsigned int entire_width;
+
+	const char *names[] = {
+	    ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+	};
+	struct stat_array *nva;
+	nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+	if (cb->cb_literal) {
+		column_width = MAX(5,
+		    (unsigned int) log10(stat_histo_max(nva,
+		    ARRAY_SIZE(names))) + 1);
+	} else {
+		column_width = 5;
+	}
+
+	namewidth = MAX(cb->cb_namewidth, strlen("latency"));
+
+	/*
+	 * Calculate the entire line width of what we're printing.  The
+	 * +2 is for the two spaces between columns:
+	 */
+				/*	 read  write			*/
+				/*	-----  -----			*/
+				/*	|___|  <---------- column_width	*/
+				/*					*/
+				/*	|__________|  <--- entire_width	*/
+				/*					*/
+	entire_width = namewidth + (column_width + 2) *
+	    label_array_len(iostat_bottom_labels[IOS_L_HISTO]);
+
+	if (cb->cb_scripted)
+		printf("%s\n", name);
+	else
+		print_iostat_header_impl(cb, column_width, name);
+
+	print_iostat_histo(nva, ARRAY_SIZE(names), cb, column_width,
+	    namewidth, scale);
+
+	free_calc_stats(nva, ARRAY_SIZE(names));
+	if (!cb->cb_scripted)
+		print_solid_separator(entire_width);
+}
+
+/*
+ * Calculate the average latency of a power-of-two latency histogram
+ */
+static uint64_t
+single_histo_average(uint64_t *histo, unsigned int buckets)
+{
+	int i;
+	uint64_t count = 0, total = 0;
+
+	for (i = 0; i < buckets; i++) {
+		/*
+		 * Our buckets are power-of-two latency ranges.  Use the
+		 * midpoint latency of each bucket to calculate the average.
+		 * For example:
+		 *
+		 * Bucket          Midpoint
+		 * 8ns-15ns:       12ns
+		 * 16ns-31ns:      24ns
+		 * ...
+		 */
+		if (histo[i] != 0) {
+			total += histo[i] * (((1ULL << i) + ((1ULL << i)/2)));
+			count += histo[i];
+		}
+	}
+
+	/* Prevent divide by zero */
+	return (count == 0 ? 0 : total / count);
+}
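To make the midpoint math above concrete, a hypothetical worked example (not part of this diff): bucket i spans [2^i, 2^(i+1)-1] ns and contributes at its midpoint 2^i + 2^i/2 = 1.5 * 2^i.

	/*
	 * Example: histo[3] = 4 (four IOs in the 8ns-15ns bucket, midpoint 12ns)
	 *          histo[4] = 2 (two IOs in the 16ns-31ns bucket, midpoint 24ns)
	 *
	 *	total   = 4 * 12 + 2 * 24 = 96
	 *	count   = 4 + 2           = 6
	 *	average = 96 / 6          = 16ns
	 */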
+static void
+print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv, double scale)
+{
+	int i;
+	uint64_t val;
+	const char *names[] = {
+	    ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+	};
+
+	struct stat_array *nva;
+
+	unsigned int column_width = default_column_width(cb, IOS_QUEUES);
+	enum zfs_nicenum_format format;
+
+	nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv);
+
+	if (cb->cb_literal)
+		format = ZFS_NICENUM_RAW;
+	else
+		format = ZFS_NICENUM_1024;
+
+	for (i = 0; i < ARRAY_SIZE(names); i++) {
+		val = nva[i].data[0] * scale;
+		print_one_stat(val, format, column_width, cb->cb_scripted);
+	}
+
+	free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+static void
+print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv, double scale)
+{
+	int i;
+	uint64_t val;
+	const char *names[] = {
+	    ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+	};
+	struct stat_array *nva;
+
+	unsigned int column_width = default_column_width(cb, IOS_LATENCY);
+	enum zfs_nicenum_format format;
+
+	nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+	if (cb->cb_literal)
+		format = ZFS_NICENUM_RAW;
+	else
+		format = ZFS_NICENUM_TIME;
+
+	/* Print our avg latencies on the line */
+	for (i = 0; i < ARRAY_SIZE(names); i++) {
+		/* Compute average latency for a latency histo */
+		val = single_histo_average(nva[i].data, nva[i].count) * scale;
+		print_one_stat(val, format, column_width, cb->cb_scripted);
+	}
+	free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+/*
+ * Print default statistics (capacity/operations/bandwidth)
+ */
+static void
+print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale)
+{
+	unsigned int column_width = default_column_width(cb, IOS_DEFAULT);
+	enum zfs_nicenum_format format;
+	char na;	/* char to print for "not applicable" values */
+
+	if (cb->cb_literal) {
+		format = ZFS_NICENUM_RAW;
+		na = '0';
+	} else {
+		format = ZFS_NICENUM_1024;
+		na = '-';
+	}
+
+	/* only toplevel vdevs have capacity stats */
+	if (vs->vs_space == 0) {
+		if (cb->cb_scripted)
+			printf("\t%c\t%c", na, na);
+		else
+			printf("  %*c  %*c", column_width, na, column_width,
+			    na);
+	} else {
+		print_one_stat(vs->vs_alloc, format, column_width,
+		    cb->cb_scripted);
+		print_one_stat(vs->vs_space - vs->vs_alloc, format,
+		    column_width, cb->cb_scripted);
+	}
+
+	print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale),
+	    format, column_width, cb->cb_scripted);
+	print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale),
+	    format, column_width, cb->cb_scripted);
+	print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale),
+	    format, column_width, cb->cb_scripted);
+	print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale),
+	    format, column_width, cb->cb_scripted);
+}
 
 /*
  * Print out all the statistics for the given vdev.  This can either be the
  * toplevel configuration, or called recursively.  If 'name' is NULL, then this
  * is a verbose output, and we don't want to display the toplevel pool stats.
+ *
+ * Returns the number of stat lines printed.
  */
-void
+unsigned int
 print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
     nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
 {
 	nvlist_t **oldchild, **newchild;
 	uint_t c, children;
-	vdev_stat_t *oldvs, *newvs;
+	vdev_stat_t *oldvs, *newvs, *calcvs;
 	vdev_stat_t zerovs = { 0 };
+	char *vname;
+	int i;
+	int ret = 0;
 	uint64_t tdelta;
 	double scale;
-	char *vname;
+
+	calcvs = safe_malloc(sizeof (*calcvs));
 
 	if (oldnv != NULL) {
 		verify(nvlist_lookup_uint64_array(oldnv,
@@ -2544,54 +3192,92 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
 		oldvs = &zerovs;
 	}
 
+	/* Do we only want to see a specific vdev? */
+	for (i = 0; i < cb->cb_vdev_names_count; i++) {
+		/* Yes we do.  Is this the vdev? */
+		if (strcmp(name, cb->cb_vdev_names[i]) == 0) {
+			/*
+			 * This is our vdev.  Since it is the only vdev we
+			 * will be displaying, make depth = 0 so that it
+			 * doesn't get indented.
+			 */
+			depth = 0;
+			break;
+		}
+	}
+
+	if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) {
+		/* Couldn't match the name; don't leak the scratch stats */
+		free(calcvs);
+		goto children;
+	}
+
 	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&newvs, &c) == 0);
 
-	if (strlen(name) + depth > cb->cb_namewidth)
-		(void) printf("%*s%s", depth, "", name);
-	else
-		(void) printf("%*s%s%*s", depth, "", name,
-		    (int)(cb->cb_namewidth - strlen(name) - depth), "");
+	/*
+	 * Print the vdev name unless it's a histogram.  Histograms
+	 * display the vdev name in the header itself.
+ */ + if (!(cb->cb_flags & IOS_L_HISTO_M)) { + if (cb->cb_scripted) { + printf("%s", name); + } else { + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - + depth), ""); + } + } + /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; - - if (tdelta == 0) - scale = 1.0; - else - scale = (double)NANOSEC / tdelta; - - /* only toplevel vdevs have capacity stats */ - if (newvs->vs_space == 0) { - (void) printf(" - -"); + if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_L_HISTO_M)) { + /* + * If we specify printing histograms with no time interval, then + * print the histogram numbers over the entire lifetime of the + * vdev. + */ + scale = 1; } else { - print_one_stat(newvs->vs_alloc); - print_one_stat(newvs->vs_space - newvs->vs_alloc); + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; } - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - - oldvs->vs_ops[ZIO_TYPE_READ]))); - - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - - oldvs->vs_ops[ZIO_TYPE_WRITE]))); - - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - - oldvs->vs_bytes[ZIO_TYPE_READ]))); + if (cb->cb_flags & IOS_DEFAULT_M) { + calc_default_iostats(oldvs, newvs, calcvs); + print_iostat_default(calcvs, cb, scale); + } + if (cb->cb_flags & IOS_LATENCY_M) + print_iostat_latency(cb, oldnv, newnv, scale); + if (cb->cb_flags & IOS_QUEUES_M) + print_iostat_queues(cb, oldnv, newnv, scale); + if (cb->cb_flags & IOS_L_HISTO_M) { + printf("\n"); + print_iostat_histos(cb, oldnv, newnv, scale, name); + } - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - - oldvs->vs_bytes[ZIO_TYPE_WRITE]))); + if (!(cb->cb_flags & IOS_L_HISTO_M)) + printf("\n"); - (void) printf("\n"); + free(calcvs); + ret++; +children: if (!cb->cb_verbose) - return; + return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &c) != 0) - return; + return (ret); for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; @@ -2607,7 +3293,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } @@ -2617,8 +3303,10 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (num_logs(newnv) > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "logs"); + if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "logs"); + } for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; @@ -2628,7 +3316,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, if (islog) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? + ret += print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -2642,23 +3330,28 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &c) != 0) - return; + return (ret); if (children > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "cache"); + if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "cache"); + } + for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, - newchild[c], cb, depth + 2); + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] + : NULL, newchild[c], cb, depth + 2); free(vname); } } + + return (ret); } static int @@ -2688,6 +3381,7 @@ print_iostat(zpool_handle_t *zhp, void *data) iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; + int ret; newconfig = zpool_get_config(zhp, &oldconfig); @@ -2703,15 +3397,13 @@ print_iostat(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); - /* - * Print out the statistics for the pool. - */ - print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); + ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, + cb, 0); + if ((ret != 0) && !(cb->cb_flags & IOS_L_HISTO_M) && !cb->cb_scripted && + cb->cb_verbose && !cb->cb_vdev_names_count) + print_iostat_separator(cb); - if (cb->cb_verbose) - print_iostat_separator(cb); - - return (0); + return (ret); } static int @@ -2742,13 +3434,14 @@ get_namewidth(zpool_handle_t *zhp, void *data) if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (!cb->cb_verbose) - cb->cb_namewidth = strlen(zpool_get_name(zhp)); + cb->cb_namewidth = poolname_len; else - cb->cb_namewidth = max_width(zhp, nvroot, 0, - cb->cb_namewidth, cb->cb_name_flags); + cb->cb_namewidth = MAX(poolname_len, + max_width(zhp, nvroot, 0, cb->cb_namewidth, + cb->cb_name_flags)); } - /* * The width must be at least 10, but may be as large as the * column width - 42 so that we can still fit in one line. @@ -2767,20 +3460,21 @@ get_namewidth(zpool_handle_t *zhp, void *data) * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void -get_interval_count(int *argcp, char **argv, unsigned long *iv, +get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ - if (argc > 0 && isdigit(argv[argc - 1][0])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -2806,12 +3500,12 @@ get_interval_count(int *argcp, char **argv, unsigned long *iv, * If the last argument is also an integer, then we have both a count * and an interval. 
*/ - if (argc > 0 && isdigit(argv[argc - 1][0])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -2846,12 +3540,299 @@ get_timestamp_arg(char c) } /* - * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]] + * Return stat flags that are supported by all pools by both the module and + * zpool iostat. "*data" should be initialized to all 0xFFs before running. + * It will get ANDed down until only the flags that are supported on all pools + * remain. + */ +static int +get_stat_flags_cb(zpool_handle_t *zhp, void *data) +{ + uint64_t *mask = data; + nvlist_t *config, *nvroot, *nvx; + uint64_t flags = 0; + int i, j; + + /* + * Lookup table for extended iostat flags to nvlist names. + * Basically a list of all the nvpairs a flag requires. + */ + static const char *vsx_type_to_nvlist[IOS_COUNT][10] = { + [IOS_L_HISTO] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + NULL}, + [IOS_LATENCY] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + NULL}, + [IOS_QUEUES] = { + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + NULL} + }; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + /* Default stats are always supported, but for completeness.. */ + if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) + flags |= IOS_DEFAULT_M; + + /* Get our extended stats nvlist from the main list */ + if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, + &nvx) != 0) { + /* + * No extended stats; they're probably running an older + * module. No big deal, we support that too. + */ + goto end; + } + + /* For each extended stat, make sure all its nvpairs are supported */ + for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { + if (!vsx_type_to_nvlist[j][0]) + continue; + + /* Start off by assuming the flag is supported, then check */ + flags |= (1ULL << j); + for (i = 0; vsx_type_to_nvlist[j][i]; i++) { + if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { + /* flag isn't supported */ + flags = flags & ~(1ULL << j); + break; + } + } + } +end: + *mask = *mask & flags; + return (0); +} + +/* + * Return a bitmask of stats that are supported on all pools by both the module + * and zpool iostat. + */ +static uint64_t +get_stat_flags(zpool_list_t *list) +{ + uint64_t mask = -1; + + /* + * get_stat_flags_cb() will lop off bits from "mask" until only the + * flags that are supported on all pools remain. + */ + pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); + return (mask); +} + +/* + * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. 
+ */
+static int
+is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data)
+{
+	iostat_cbdata_t *cb = cb_data;
+	char *name;
+	int match;
+
+	name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags);
+	match = (strcmp(name, cb->cb_vdev_names[0]) == 0); /* 1 = match */
+	free(name);	/* zpool_vdev_name() allocates */
+
+	return (match);
+}
+
+/*
+ * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise.
+ */
+static int
+is_vdev(zpool_handle_t *zhp, void *cb_data)
+{
+	return (for_each_vdev(zhp, is_vdev_cb, cb_data));
+}
+
+/*
+ * Check if vdevs are in a pool
+ *
+ * Return 1 if all argv[] strings are vdev names in pool "pool_name".  Otherwise
+ * return 0.  If pool_name is NULL, then search all pools.
+ */
+static int
+are_vdevs_in_pool(int argc, char **argv, char *pool_name,
+    iostat_cbdata_t *cb)
+{
+	char **tmp_name;
+	int ret = 0;
+	int i;
+	int pool_count = 0;
+
+	if ((argc == 0) || !*argv)
+		return (0);
+
+	if (pool_name)
+		pool_count = 1;
+
+	/* Temporarily hijack cb_vdev_names for a second... */
+	tmp_name = cb->cb_vdev_names;
+
+	/* Go through our list of prospective vdev names */
+	for (i = 0; i < argc; i++) {
+		cb->cb_vdev_names = argv + i;
+
+		/* Is this name a vdev in our pools? */
+		ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL,
+		    is_vdev, cb);
+		if (!ret) {
+			/* No match */
+			break;
+		}
+	}
+
+	cb->cb_vdev_names = tmp_name;
+
+	return (ret);
+}
+
+static int
+is_pool_cb(zpool_handle_t *zhp, void *data)
+{
+	char *name = data;
+	if (strcmp(name, zpool_get_name(zhp)) == 0)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Do we have a pool named *name?  If so, return 1, otherwise 0.
+ */
+static int
+is_pool(char *name)
+{
+	return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name));
+}
+
+/* Are all our argv[] strings pool names?  If so return 1, 0 otherwise. */
+static int
+are_all_pools(int argc, char **argv)
+{
+	if ((argc == 0) || !*argv)
+		return (0);
+
+	while (--argc >= 0)
+		if (!is_pool(argv[argc]))
+			return (0);
+
+	return (1);
+}
+
+/*
+ * Helper function to print out vdev/pool names we can't resolve.  Used for an
+ * error message.
+ */
+static void
+error_list_unresolved_vdevs(int argc, char **argv, char *pool_name,
+    iostat_cbdata_t *cb)
+{
+	int i;
+	char *name;
+	char *str;
+	for (i = 0; i < argc; i++) {
+		name = argv[i];
+
+		if (is_pool(name))
+			str = gettext("pool");
+		else if (are_vdevs_in_pool(1, &name, pool_name, cb))
+			str = gettext("vdev in this pool");
+		else if (are_vdevs_in_pool(1, &name, NULL, cb))
+			str = gettext("vdev in another pool");
+		else
+			str = gettext("unknown");
+
+		fprintf(stderr, "\t%s (%s)\n", name, str);
+	}
+}
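Taken together, is_pool() and are_vdevs_in_pool() give the four-way classification printed above; a hypothetical helper (not part of this diff) makes the decision table explicit:

	/* Hypothetical sketch: classify one user-supplied name */
	static const char *
	classify_name(char *name, char *pool_name, iostat_cbdata_t *cb)
	{
		if (is_pool(name))
			return ("pool");
		if (are_vdevs_in_pool(1, &name, pool_name, cb))
			return ("vdev in this pool");
		if (are_vdevs_in_pool(1, &name, NULL, cb))
			return ("vdev in another pool");
		return ("unknown");
	}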
+/*
+ * Same as get_interval_count(), but with additional checks to not misinterpret
+ * guids as interval/count values.  Assumes VDEV_NAME_GUID is set in
+ * cb.cb_name_flags.
+ */
+static void
+get_interval_count_filter_guids(int *argc, char **argv, float *interval,
+    unsigned long *count, iostat_cbdata_t *cb)
+{
+	char **tmpargv = argv;
+	int argc_for_interval = 0;
+
+	/* Is the last arg an interval value?  Or a guid? */
+	if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) {
+		/*
+		 * The last arg is not a guid, so it's probably an
+		 * interval value.
+		 */
+		argc_for_interval++;
+
+		if (*argc >= 2 &&
+		    !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) {
+			/*
+			 * The 2nd to last arg is not a guid, so it's probably
+			 * an interval value.
+			 */
+			argc_for_interval++;
+		}
+	}
+
+	/* Point to our list of possible intervals */
+	tmpargv = &argv[*argc - argc_for_interval];
+
+	*argc = *argc - argc_for_interval;
+	get_interval_count(&argc_for_interval, tmpargv,
+	    interval, count);
+}
+
+/*
+ * Floating point sleep().  Allows you to pass in a floating point value for
+ * seconds.
+ */
+static void
+fsleep(float sec)
+{
+	struct timespec req;
+	req.tv_sec = floor(sec);
+	req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC;
+	nanosleep(&req, NULL);
+}
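A quick sanity check of the float-to-timespec split above (hypothetical values, not part of this diff):

	/*
	 * Example: fsleep(2.5)
	 *	req.tv_sec  = floor(2.5)            = 2
	 *	req.tv_nsec = (2.5 - 2.0) * NANOSEC = 500000000 (0.5 sec)
	 */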
+
+
+/*
+ * zpool iostat [-ghHLpPvy] [[-lq]|-w] [-T d|u]
+ *	[[pool ...]|[pool vdev ...]|[vdev ...]]
+ *	[interval [count]]
 *
 * -g	Display guid for individual vdev name.
 * -L	Follow links when resolving vdev path name.
 * -P	Display full path for vdev name.
 * -v	Display statistics for individual vdevs
+ * -h	Display help
+ * -p	Display values in parsable (exact) format.
+ * -H	Scripted mode.  Don't display headers, and separate properties
+ *	by a single tab.
+ * -l	Display average latency
+ * -q	Display queue depths
+ * -w	Display histograms
+ * -T	Display a timestamp in date(1) or Unix format
 *
 * This command can be tricky because we want to be able to deal with pool
@@ -2866,17 +3847,26 @@ zpool_do_iostat(int argc, char **argv)
 	int c;
 	int ret;
 	int npools;
-	unsigned long interval = 0, count = 0;
+	float interval = 0;
+	unsigned long count = 0;
 	zpool_list_t *list;
 	boolean_t verbose = B_FALSE;
+	boolean_t latency = B_FALSE, histo = B_FALSE;
+	boolean_t queues = B_FALSE, parseable = B_FALSE, scripted = B_FALSE;
 	boolean_t omit_since_boot = B_FALSE;
 	boolean_t guid = B_FALSE;
 	boolean_t follow_links = B_FALSE;
 	boolean_t full_name = B_FALSE;
 	iostat_cbdata_t cb = { 0 };
 
+	/* Used for printing error message */
+	const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q',
+	    [IOS_L_HISTO] = 'w'};
+
+	uint64_t unsupported_flags;
+
 	/* check options */
-	while ((c = getopt(argc, argv, "gLPT:vy")) != -1) {
+	while ((c = getopt(argc, argv, "gLPT:vyhplqwH")) != -1) {
 		switch (c) {
 		case 'g':
 			guid = B_TRUE;
@@ -2893,9 +3883,27 @@ zpool_do_iostat(int argc, char **argv)
 		case 'v':
 			verbose = B_TRUE;
 			break;
+		case 'p':
+			parseable = B_TRUE;
+			break;
+		case 'l':
+			latency = B_TRUE;
+			break;
+		case 'q':
+			queues = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'w':
+			histo = B_TRUE;
+			break;
 		case 'y':
 			omit_since_boot = B_TRUE;
 			break;
+		case 'h':
+			usage(B_FALSE);
+			break;
 		case '?':
 			(void) fprintf(stderr,
 			    gettext("invalid option '%c'\n"), optopt);
@@ -2906,7 +3914,70 @@ zpool_do_iostat(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	get_interval_count(&argc, argv, &interval, &count);
+	cb.cb_literal = parseable;
+	cb.cb_scripted = scripted;
+
+	if (guid)
+		cb.cb_name_flags |= VDEV_NAME_GUID;
+	if (follow_links)
+		cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+	if (full_name)
+		cb.cb_name_flags |= VDEV_NAME_PATH;
+	cb.cb_iteration = 0;
+	cb.cb_namewidth = 0;
+	cb.cb_verbose = verbose;
+
+	/* Get our interval and count values (if any) */
+	if (guid) {
+		get_interval_count_filter_guids(&argc, argv, &interval,
+		    &count, &cb);
+	} else {
+		get_interval_count(&argc, argv, &interval, &count);
+	}
+
+	if (argc == 0) {
+		/* No args, so just print the defaults. */
+	} else if (are_all_pools(argc, argv)) {
+		/* All the args are pool names */
+	} else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) {
+		/* All the args are vdevs */
+		cb.cb_vdev_names = argv;
+		cb.cb_vdev_names_count = argc;
+		argc = 0; /* No pools to process */
+	} else if (are_all_pools(1, argv)) {
+		/* The first arg is a pool name */
+		if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) {
+			/* ...and the rest are vdev names */
+			cb.cb_vdev_names = argv + 1;
+			cb.cb_vdev_names_count = argc - 1;
+			argc = 1; /* One pool to process */
+		} else {
+			fprintf(stderr, gettext("Expected either a list of "));
+			fprintf(stderr, gettext("pools, or list of vdevs in"));
+			fprintf(stderr, " \"%s\", ", argv[0]);
+			fprintf(stderr, gettext("but got:\n"));
+			error_list_unresolved_vdevs(argc - 1, argv + 1,
+			    argv[0], &cb);
+			fprintf(stderr, "\n");
+			usage(B_FALSE);
+			return (1);
+		}
+	} else {
+		/*
+		 * The args don't make sense.  The first arg isn't a pool name,
+		 * nor are all the args vdevs.
+		 */
+		fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n"));
+		fprintf(stderr, "\n");
+		return (1);
+	}
+
+	if (cb.cb_vdev_names_count != 0) {
+		/*
+		 * If user specified vdevs, it implies verbose.
+		 */
+		cb.cb_verbose = B_TRUE;
+	}
 
 	/*
 	 * Construct the list of all interesting pools.
@@ -2926,19 +3997,56 @@ zpool_do_iostat(int argc, char **argv)
 		return (1);
 	}
 
+	if (histo && (queues || latency)) {
+		pool_list_free(list);
+		(void) fprintf(stderr,
+		    gettext("-w isn't allowed with [-q|-l]\n"));
+		usage(B_FALSE);
+		return (1);
+	}
+
 	/*
 	 * Enter the main iostat loop.
 	 */
 	cb.cb_list = list;
-	cb.cb_verbose = verbose;
-	if (guid)
-		cb.cb_name_flags |= VDEV_NAME_GUID;
-	if (follow_links)
-		cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
-	if (full_name)
-		cb.cb_name_flags |= VDEV_NAME_PATH;
-	cb.cb_iteration = 0;
-	cb.cb_namewidth = 0;
+
+	if (histo) {
+		/*
+		 * Histogram tables look out of place when you try to display
+		 * them with the other stats, so make a rule that you can only
+		 * print histograms by themselves.
+		 */
+		cb.cb_flags = IOS_L_HISTO_M;
+	} else {
+		cb.cb_flags = IOS_DEFAULT_M;
+		if (latency)
+			cb.cb_flags |= IOS_LATENCY_M;
+		if (queues)
+			cb.cb_flags |= IOS_QUEUES_M;
+	}
+
+	/*
+	 * See if the module supports all the stats we want to display.
+	 */
+	unsupported_flags = cb.cb_flags & ~get_stat_flags(list);
+	if (unsupported_flags) {
+		uint64_t f;
+		int idx;
+		fprintf(stderr,
+		    gettext("The loaded zfs module doesn't support:"));
+
+		/* for each bit set in unsupported_flags */
+		for (f = unsupported_flags; f; f &= ~(1ULL << idx)) {
+			idx = lowbit64(f) - 1;
+			fprintf(stderr, " -%c", flag_to_arg[idx]);
+		}
+
+		fprintf(stderr, ". Try running a newer module.\n");
+		pool_list_free(list);
+
+		return (1);
+	}
+
 	for (;;) {
 		if ((npools = pool_list_count(list)) == 0)
@@ -2949,7 +4057,7 @@ zpool_do_iostat(int argc, char **argv)
 		 * we skip any printing.
 		 */
 		boolean_t skip = (omit_since_boot &&
-			cb.cb_iteration == 0);
+		    cb.cb_iteration == 0);
 
 		/*
 		 * Refresh all statistics.  This is done as an
@@ -2958,7 +4066,7 @@ zpool_do_iostat(int argc, char **argv)
 		 * properly accounted for.
*/ (void) pool_list_iter(list, B_FALSE, refresh_iostat, - &cb); + &cb); /* * Iterate over all pools to determine the maximum width @@ -2966,7 +4074,7 @@ zpool_do_iostat(int argc, char **argv) */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, - &cb); + &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); @@ -2974,28 +4082,38 @@ zpool_do_iostat(int argc, char **argv) /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. + * + * The histogram code explicitly prints its header on + * every vdev, so skip this for histograms. */ - if ((++cb.cb_iteration == 1 && !skip) || - (skip != verbose)) + if (((++cb.cb_iteration == 1 && !skip) || + (skip != verbose)) && + (!(cb.cb_flags & IOS_L_HISTO_M)) && + !cb.cb_scripted) print_iostat_header(&cb); if (skip) { - (void) sleep(interval); + (void) fsleep(interval); continue; } - (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); + pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. + * + * In addition, if we're printing specific vdevs then + * we also want an ending separator. */ - if (npools > 1 && !verbose) + if (((npools > 1 && !verbose && + !(cb.cb_flags & IOS_L_HISTO_M)) || + (!(cb.cb_flags & IOS_L_HISTO_M) && + cb.cb_vdev_names_count)) && + !cb.cb_scripted) { print_iostat_separator(&cb); - - if (verbose) - (void) printf("\n"); + } } /* @@ -3010,7 +4128,7 @@ zpool_do_iostat(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } pool_list_free(list); @@ -3352,7 +4470,8 @@ zpool_do_list(int argc, char **argv) "name,size,allocated,free,expandsize,fragmentation,capacity," "dedupratio,health,altroot"; char *props = default_props; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; @@ -3427,7 +4546,7 @@ zpool_do_list(int argc, char **argv) break; pool_list_free(list); - (void) sleep(interval); + (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { @@ -4776,7 +5895,8 @@ zpool_do_status(int argc, char **argv) { int c; int ret; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; status_cbdata_t cb = { 0 }; /* check options */ @@ -4841,7 +5961,7 @@ zpool_do_status(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } return (0); diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c index c7a002efb..df3f9bf83 100644 --- a/cmd/zpool/zpool_util.c +++ b/cmd/zpool/zpool_util.c @@ -29,6 +29,7 @@ #include <stdio.h> #include <stdlib.h> #include <strings.h> +#include <ctype.h> #include "zpool_util.h" @@ -84,3 +85,27 @@ num_logs(nvlist_t *nv) } return (nlogs); } + +/* Find the max element in an array of uint64_t values */ +uint64_t +array64_max(uint64_t array[], unsigned int len) { + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array[i]); + + return (max); +} + +/* + * Return 1 if "str" is a number string, 0 otherwise. Works for integer and + * floating point numbers. 
+ */
+int
+isnumber(char *str)
+{
+	/* An empty string is not a number */
+	if (*str == '\0')
+		return (0);
+
+	for (; *str; str++)
+		if (!(isdigit(*str) || (*str == '.')))
+			return (0);
+
+	return (1);
+}
diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h
index 1b4ce518f..f279fd5dd 100644
--- a/cmd/zpool/zpool_util.h
+++ b/cmd/zpool/zpool_util.h
@@ -38,6 +38,8 @@ extern "C" {
 void *safe_malloc(size_t);
 void zpool_no_memory(void);
 uint_t num_logs(nvlist_t *nv);
+uint64_t array64_max(uint64_t array[], unsigned int len);
+int isnumber(char *str);
 
 /*
  * Virtual device functions
@@ -55,6 +57,10 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
 int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
     zpool_iter_f, void *);
 
+/* Vdev list functions */
+typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
+int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
+
 typedef struct zpool_list zpool_list_t;
 
 zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
diff --git a/include/libzfs.h b/include/libzfs.h
index 3faee0add..654b93284 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -747,10 +747,21 @@ extern int zfs_unshareall(zfs_handle_t *);
 extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
     void *, void *, int, zfs_share_op_t);
 
+enum zfs_nicenum_format {
+	ZFS_NICENUM_1024 = 0,
+	ZFS_NICENUM_TIME = 1,
+	ZFS_NICENUM_RAW = 2
+};
+
 /*
  * Utility function to convert a number to a human-readable form.
  */
 extern void zfs_nicenum(uint64_t, char *, size_t);
+extern void zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
+    enum zfs_nicenum_format type);
+extern void zfs_nicetime(uint64_t, char *, size_t);
 extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
 
 /*
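The three formats declared above differ only in base and rendering; a small hypothetical usage sketch (values cross-checked against the implementation in lib/libzfs/libzfs_util.c further down):

	char buf[32];

	zfs_nicenum_format(1536, buf, sizeof (buf), ZFS_NICENUM_1024);
	/* buf = "1.50K" (1536 / 1024 = 1.5) */

	zfs_nicenum_format(1536, buf, sizeof (buf), ZFS_NICENUM_TIME);
	/* buf = "1us" (1536ns; time values are floored, never rounded up) */

	zfs_nicenum_format(1536, buf, sizeof (buf), ZFS_NICENUM_RAW);
	/* buf = "1536" */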
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e2974ad7a..65dba125c 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -32,6 +32,7 @@
 #define	_SYS_FS_ZFS_H
 
 #include <sys/time.h>
+#include <sys/zio_priority.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -528,6 +529,37 @@ typedef struct zpool_rewind_policy {
 #define	ZPOOL_CONFIG_DTL		"DTL"
 #define	ZPOOL_CONFIG_SCAN_STATS		"scan_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_VDEV_STATS		"vdev_stats"	/* not stored on disk */
+
+/* container nvlist of extended stats */
+#define	ZPOOL_CONFIG_VDEV_STATS_EX	"vdev_stats_ex"
+
+/* Active queue read/write stats */
+#define	ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE	"vdev_sync_r_active_queue"
+#define	ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE	"vdev_sync_w_active_queue"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE	"vdev_async_r_active_queue"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE	"vdev_async_w_active_queue"
+#define	ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE	"vdev_async_scrub_active_queue"
+
+/* Queue sizes */
+#define	ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE	"vdev_sync_r_pend_queue"
+#define	ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE	"vdev_sync_w_pend_queue"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE	"vdev_async_r_pend_queue"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE	"vdev_async_w_pend_queue"
+#define	ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE	"vdev_async_scrub_pend_queue"
+
+/* Latency read/write histogram stats */
+#define	ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO	"vdev_tot_r_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO	"vdev_tot_w_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO	"vdev_disk_r_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO	"vdev_disk_w_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO	"vdev_sync_r_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO	"vdev_sync_w_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO	"vdev_async_r_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO	"vdev_async_w_lat_histo"
+#define	ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO	"vdev_scrub_histo"
+
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
@@ -766,9 +798,50 @@ typedef struct vdev_stat {
 	uint64_t	vs_scan_removing;	/* removing?	*/
 	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
 	uint64_t	vs_fragmentation;	/* device fragmentation */
+
 } vdev_stat_t;
 
 /*
+ * Extended stats
+ *
+ * These are stats which aren't included in the original iostat output.  For
+ * convenience, they are grouped together in vdev_stat_ex, although each stat
+ * is individually exported as an nvlist.
+ */
+typedef struct vdev_stat_ex {
+	/* Number of ZIOs issued to disk and waiting to finish */
+	uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+	/* Number of ZIOs pending to be issued to disk */
+	uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
+
+	/*
+	 * Below are the histograms for various latencies.  Buckets are in
+	 * units of nanoseconds.
+	 */
+
+	/*
+	 * 2^37 nanoseconds = ~137s.  Timeouts will probably start kicking in
+	 * before this.
+	 */
+#define	VDEV_HISTO_BUCKETS	37
+
+	/* Amount of time in ZIO queue (ns) */
+	uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
+	    [VDEV_HISTO_BUCKETS];
+
+	/* Total ZIO latency (ns).  Includes queuing and disk access time */
+	uint64_t vsx_total_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS];
+
+	/* Amount of time to read/write the disk (ns) */
+	uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS];
+
+	/* "lookup the bucket for a value" macro */
+#define	HISTO(a)	((a) != 0 ? MIN(highbit64(a) - 1, \
+	    VDEV_HISTO_BUCKETS - 1) : 0)
+
+} vdev_stat_ex_t;
+
+/*
  * DDT statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 5abd8c019..4f54b1707 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -85,7 +85,7 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd);
 
-
+extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
@@ -153,6 +153,7 @@ extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
 extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
 
 typedef enum {
 	VDEV_LABEL_CREATE,	/* create/add a new device */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 4958cad9c..0d09c81c7 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -150,6 +150,7 @@ struct vdev {
 	vdev_t		**vdev_child;	/* array of children */
 	uint64_t	vdev_children;	/* number of children */
 	vdev_stat_t	vdev_stat;	/* virtual device statistics */
+	vdev_stat_ex_t	vdev_stat_ex;	/* extended statistics */
 	boolean_t	vdev_expanding;	/* expand the vdev? */
 	boolean_t	vdev_reopening;	/* reopen in progress?
*/ boolean_t vdev_nonrot; /* true if solid state */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index e68223eb3..693035ee2 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -647,6 +647,7 @@ extern void delay(clock_t ticks); extern uint64_t physmem; extern int highbit64(uint64_t i); +extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); diff --git a/include/sys/zio.h b/include/sys/zio.h index ced7fe87b..9790b4a90 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -421,7 +421,8 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_delta; /* vdev queue service delta */ - uint64_t io_delay; /* vdev disk service delta (ticks) */ + hrtime_t io_delay; /* Device access time (disk or */ + /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index e33b9585b..3fc3589be 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -29,8 +29,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ + ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; #ifdef __cplusplus diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/sys/sysmacros.h index 5d10657be..c2525dd2a 100644 --- a/lib/libspl/include/sys/sysmacros.h +++ b/lib/libspl/include/sys/sysmacros.h @@ -39,6 +39,9 @@ #ifndef ABS #define ABS(a) ((a) < 0 ? -(a) : (a)) #endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif #define makedevice(maj, min) makedev(maj, min) #define _sysconf(a) sysconf(a) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 8cacc01dd..789df407c 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3538,7 +3538,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, @@ -3552,7 +3551,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, */ if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id; - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 57c2ac853..926ed4ed8 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -596,27 +596,49 @@ zfs_strdup(libzfs_handle_t *hdl, const char *str) * Convert a number to an appropriately human-readable output. 
*/ void -zfs_nicenum(uint64_t num, char *buf, size_t buflen) +zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, + enum zfs_nicenum_format format) { uint64_t n = num; int index = 0; - char u; + const char *u; + const char *units[3][7] = { + [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} + }; + + const int units_len[] = {[ZFS_NICENUM_1024] = 6, + [ZFS_NICENUM_TIME] = 4}; + + const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, + [ZFS_NICENUM_TIME] = 1000}; - while (n >= 1024 && index < 6) { - n /= 1024; + double val; + + if (format == ZFS_NICENUM_RAW) { + snprintf(buf, buflen, "%llu", (u_longlong_t) num); + return; + } + + + while (n >= k_unit[format] && index < units_len[format]) { + n /= k_unit[format]; index++; } - u = " KMGTPE"[index]; + u = units[format][index]; - if (index == 0) { - (void) snprintf(buf, buflen, "%llu", (u_longlong_t) n); - } else if ((num & ((1ULL << 10 * index) - 1)) == 0) { + /* Don't print 0ns times */ + if ((format == ZFS_NICENUM_TIME) && (num == 0)) { + (void) snprintf(buf, buflen, "-"); + } else if ((index == 0) || ((num % + (uint64_t) powl(k_unit[format], index)) == 0)) { /* * If this is an even multiple of the base, always display * without any decimal precision. */ - (void) snprintf(buf, buflen, "%llu%c", (u_longlong_t) n, u); + (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t) n, u); + } else { /* * We want to choose a precision that reflects the best choice @@ -629,13 +651,61 @@ zfs_nicenum(uint64_t num, char *buf, size_t buflen) */ int i; for (i = 2; i >= 0; i--) { - if (snprintf(buf, buflen, "%.*f%c", i, - (double)num / (1ULL << 10 * index), u) <= 5) - break; + val = (double) num / + (uint64_t) powl(k_unit[format], index); + + /* + * Don't print floating point values for time. Note, + * we use floor() instead of round() here, since + * round can result in undesirable results. For + * example, if "num" is in the range of + * 999500-999999, it will print out "1000us". This + * doesn't happen if we use floor(). + */ + if (format == ZFS_NICENUM_TIME) { + if (snprintf(buf, buflen, "%d%s", + (unsigned int) floor(val), u) <= 5) + break; + + } else { + if (snprintf(buf, buflen, "%.*f%s", i, + val, u) <= 5) + break; + } } } } +/* + * Convert a number to an appropriately human-readable output. + */ +void +zfs_nicenum(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); +} + +/* + * Convert a time to an appropriately human-readable output. + * @num: Time in nanoseconds + */ +void +zfs_nicetime(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); +} + +/* + * Print out a raw number with correct column spacing + */ +void +zfs_niceraw(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); +} + + + void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 49d17ece3..3d85093e2 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1071,6 +1071,50 @@ highbit64(uint64_t i) return (h); } +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * This is basically a reimplementation of ffsll(), which is GNU specific. 
+ */
+int
+lowbit64(uint64_t i)
+{
+	register int h = 64;
+	if (i == 0)
+		return (0);
+
+	if (i & 0x00000000ffffffffULL)
+		h -= 32;
+	else
+		i >>= 32;
+
+	if (i & 0x0000ffff)
+		h -= 16;
+	else
+		i >>= 16;
+
+	if (i & 0x00ff)
+		h -= 8;
+	else
+		i >>= 8;
+
+	if (i & 0x0f)
+		h -= 4;
+	else
+		i >>= 4;
+
+	if (i & 0x3)
+		h -= 2;
+	else
+		i >>= 2;
+
+	if (i & 0x1)
+		h -= 1;
+
+	return (h);
+}
+
 static int random_fd = -1, urandom_fd = -1;
 
 static int
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index 231043d75..7a0748c03 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -67,7 +67,7 @@ static void
 show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
 {
 	vdev_stat_t *vs;
-	vdev_stat_t v0 = { 0 };
+	vdev_stat_t *v0 = NULL;
 	uint64_t sec;
 	uint64_t is_log = 0;
 	nvlist_t **child;
@@ -76,6 +76,8 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
 	char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
 	char *prefix = "";
 
+	v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL);
+
 	if (indent == 0 && desc != NULL) {
 		(void) printf("                           "
 		    " capacity   operations   bandwidth  ---- errors ----\n");
@@ -91,7 +93,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
 
 	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &c) != 0)
-		vs = &v0;
+		vs = v0;
 
 	sec = MAX(1, vs->vs_timestamp / NANOSEC);
 
@@ -114,6 +116,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
 		    vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
 		    rops, wops, rbytes, wbytes, rerr, werr, cerr);
 	}
+	umem_free(v0, sizeof (*v0));
 
 	if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0)
 		return;
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index bcbcaa249..1f14eee98 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -95,7 +95,9 @@ zpool \- configures ZFS storage pools
 .LP
 .nf
-\fBzpool iostat\fR [\fB-T\fR d | u ] [\fB-gLPvy\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]
+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]]
+	[[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR
+
 .fi
 
 .LP
@@ -1677,11 +1679,22 @@ Scan using the default search path, the libblkid cache will not be consulted. A
 .ne 2
 .mk
 .na
-\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-gLPvy\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR
+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR
+
 .ad
 .sp .6
 .RS 4n
-Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for every pool in the system is shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
+Displays \fBI/O\fR statistics for the given \fIpool\fRs/\fIvdev\fRs. You can
+pass in a list of \fIpool\fRs, a \fIpool\fR and list of \fIvdev\fRs in that
+\fIpool\fR, or a list of any \fIvdev\fRs from any \fIpool\fR. If no items are
+specified, statistics for every pool in the system are shown. When given an
+interval, the statistics are printed every \fIinterval\fR seconds until
+\fBCtrl-C\fR is pressed. If \fIcount\fR is specified, the command exits after
+\fIcount\fR reports are printed.
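+.\" Illustrative invocations (pool and disk names are hypothetical):
+.\"   zpool iostat -v tank 5           # per-vdev stats every 5 seconds
+.\"   zpool iostat -lq tank sdb 1 10   # latency and queue stats, 10 reports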
The first report printed is always the +statistics since boot regardless of whether \fIinterval\fR and \fIcount\fR +are passed. However, this behavior can be suppressed with the -y flag. Also +note that the units of 'K', 'M', 'G'... that are printed in the report are in +base 1024. To get the raw values, use the \fB-p\fR flag. .sp .ne 2 .mk @@ -1710,6 +1723,17 @@ Display vdev GUIDs instead of the normal device names. These GUIDs can be used i .ne 2 .mk .na +\fB\fB-H\fR\fR +.ad +.RS 12n +.rt +Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. +.RE + +.sp +.ne 2 +.mk +.na \fB\fB-L\fR\fR .ad .RS 12n @@ -1721,6 +1745,17 @@ Display real paths for vdevs resolving all symbolic links. This can be used to l .ne 2 .mk .na +\fB\fB-p\fR\fR +.ad +.RS 12n +.rt +Display numbers in parseable (exact) values. Time values are in nanoseconds. +.RE + +.sp +.ne 2 +.mk +.na \fB\fB-P\fR\fR .ad .RS 12n @@ -1749,9 +1784,177 @@ Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within t .rt Omit statistics since boot. Normally the first line of output reports the statistics since boot. This option suppresses that first line of output. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-w\fR\fR +.ad +.RS 12n +.rt +Display latency histograms: + +.sp +.ne 2 +.mk +.na +total_wait: +.ad +.RS 20n +.rt +Total IO time (queuing + disk IO time). +.RE +.ne 2 +.mk +.na +disk_wait: +.ad +.RS 20n +.rt +Disk IO time (time reading/writing the disk). +.RE +.ne 2 +.mk +.na +syncq_wait: +.ad +.RS 20n +.rt +Amount of time IO spent in synchronous priority queues. Does not include +disk time. +.RE +.ne 2 +.mk +.na +asyncq_wait: +.ad +.RS 20n +.rt +Amount of time IO spent in asynchronous priority queues. Does not include +disk time. +.RE +.ne 2 +.mk +.na +scrub: +.ad +.RS 20n +.rt +Amount of time IO spent in scrub queue. Does not include disk time. + + +.RE + +All histogram buckets are power-of-two sized. The time labels are the end +ranges of the buckets, so for example, a 15ns bucket stores latencies from +8-15ns. The last bucket is also a catch-all for latencies higher than the +maximum. +.RE +.sp +.ne 2 +.mk +.na +\fB\fB-l\fR\fR +.ad +.RS 12n +.rt +Include average latency statistics: + +.sp +.ne 2 +.mk +.na +total_wait: +.ad +.RS 20n +.rt +Average total IO time (queuing + disk IO time). +.RE +.ne 2 +.mk +.na +disk_wait: +.ad +.RS 20n +.rt +Average disk IO time (time reading/writing the disk). +.RE +.ne 2 +.mk +.na +syncq_wait: +.ad +.RS 20n +.rt +Average amount of time IO spent in synchronous priority queues. Does not +include disk time. +.RE +.ne 2 +.mk +.na +asyncq_wait: +.ad +.RS 20n +.rt +Average amount of time IO spent in asynchronous priority queues. Does not +include disk time. +.RE +.ne 2 +.mk +.na +scrub: +.ad +.RS 20n +.rt +Average queuing time in scrub queue. Does not include disk time. +.RE .RE +.sp +.ne 2 +.mk +.na +\fB\fB-q\fR\fR +.ad +.RS 12n +.rt +Include active queue statistics. Each priority queue has both pending ("pend") +and active ("activ") IOs. Pending IOs are waiting to be issued to the disk, and +active IOs have been issued to disk and are waiting for completion. These stats +are broken out by priority queue: +.sp +.ne 2 +.mk +.na +syncq_read/write: +.ad +.RS 20n +.rt +Current number of entries in synchronous priority queues. +.RE +.ne 2 +.mk +.na +asyncq_read/write: +.ad +.RS 20n +.rt +Current number of entries in asynchronous priority queues. +.RE +.ne 2 +.mk +.na +scrubq_read: +.ad +.RS 20n +.rt +Current number of entries in scrub queue. 
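+.\" Sample \fB-q\fR layout (numbers are illustrative), using the queue and
+.\" pend/activ column names described above:
+.\"        syncq_read  syncq_write  asyncq_read  asyncq_write  scrubq_read
+.\" pool   pend activ  pend activ   pend activ   pend activ    pend activ
+.\" tank      0     1     0     2      5     4      0     0       0     0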
+.RE +All queue statistics are instantaneous measurements of the number of entries +in the queues. If you specify an interval, the measurements will be sampled +from the end of the interval. +.RE .sp .ne 2 .mk diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9246495ee..c23fd7a3a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3350,6 +3350,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); + vdev_config_generate_stats(vd, l2cache[i]); + } } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7114c2efc..137390173 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2764,50 +2764,124 @@ vdev_accessible(vdev_t *vd, zio_t *zio) return (B_TRUE); } -/* - * Get statistics for the given vdev. - */ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +static void +vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int c, t; + int t; + for (t = 0; t < ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + cvs->vs_scan_removing = cvd->vdev_removing; +} - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; - if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; +/* + * Get extended stats + */ +static void +vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) +{ + int t, b; + for (t = 0; t < ZIO_TYPES; t++) { + for (b = 0; b < VDEV_HISTO_BUCKETS; b++) { + vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; + vsx->vsx_total_histo[t][b] += + cvsx->vsx_total_histo[t][b]; + } + } + + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + for (b = 0; b < VDEV_HISTO_BUCKETS; b++) { + vsx->vsx_queue_histo[t][b] += + cvsx->vsx_queue_histo[t][b]; + } + vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; + vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; } +} +/* + * Get statistics for the given vdev. + */ +static void +vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + int c, t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; + if (!vd->vdev_ops->vdev_op_leaf) { + if (vs) { + memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); + memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); + } + if (vsx) + memset(vsx, 0, sizeof (*vsx)); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; + vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; - for (t = 0; t < ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - cvs->vs_scan_removing = cvd->vdev_removing; + vdev_get_stats_ex_impl(cvd, cvs, cvsx); + if (vs) + vdev_get_child_stat(cvd, vs, cvs); + if (vsx) + vdev_get_child_stat_ex(cvd, vsx, cvsx); + + } + } else { + /* + * We're a leaf. Just copy our ZIO active queue stats in. The + * other leaf stats are updated in vdev_stat_update(). 
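+		 *
+		 * (The vq_class loop below is an instantaneous snapshot:
+		 * vqc_active counts I/Os already issued to the device, and
+		 * avl_numnodes() on the queued tree counts I/Os still waiting
+		 * to be issued, i.e. the "activ" and "pend" columns of
+		 * zpool iostat -q.)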
+ */ + if (!vsx) + return; + + memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); + + for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { + vsx->vsx_active_queue[t] = + vd->vdev_queue.vq_class[t].vqc_active; + vsx->vsx_pend_queue[t] = avl_numnodes( + &vd->vdev_queue.vq_class[t].vqc_queued_tree); + } + } +} + +void +vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + mutex_enter(&vd->vdev_stat_lock); + if (vs) { + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE; + vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; + if (vd->vdev_aux == NULL && vd == vd->vdev_top && + !vd->vdev_ishole) { + vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } } + + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0); + vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + return (vdev_get_stats_ex(vd, vs, NULL)); +} + +void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); @@ -2840,6 +2914,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; + vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; @@ -2890,8 +2965,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_self_healed += psize; } - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; + /* + * The bytes/ops/histograms are recorded at the leaf level and + * aggregated into the higher level vdevs in vdev_get_stats(). + */ + if (vd->vdev_ops->vdev_op_leaf) { + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + if (zio->io_delta && zio->io_delay) { + vsx->vsx_queue_histo[zio->io_priority] + [HISTO(zio->io_delta - zio->io_delay)]++; + vsx->vsx_disk_histo[type] + [HISTO(zio->io_delay)]++; + vsx->vsx_total_histo[type] + [HISTO(zio->io_delta)]++; + } + } mutex_exit(&vd->vdev_stat_lock); return; diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 9b51ecc1d..4e362226a 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -100,9 +100,9 @@ vdev_disk_error(zio_t *zio) { #ifdef ZFS_DEBUG printk("ZFS: zio error=%d type=%d offset=%llu size=%llu " - "flags=%x delay=%llu\n", zio->io_error, zio->io_type, + "flags=%x\n", zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, - zio->io_flags, (u_longlong_t)zio->io_delay); + zio->io_flags); #endif } @@ -410,7 +410,6 @@ vdev_disk_dio_put(dio_request_t *dr) vdev_disk_dio_free(dr); if (zio) { - zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) @@ -588,8 +587,6 @@ retry: /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); - if (zio) - zio->io_delay = jiffies_64; /* Submit all bio's associated with this dio */ for (i = 0; i < dr->dr_bio_count; i++) @@ -630,7 +627,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc) int rc = bio->bi_error; #endif - zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = -rc; if (rc && (rc == -EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; @@ -660,7 +656,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio->bi_bdev = bdev; - zio->io_delay = jiffies_64; 
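+	/*
+	 * Device timing is now handled generically: zio_vdev_io_start()
+	 * stamps io_delay with gethrtime() and zio_vdev_io_done() turns it
+	 * into a delta (see the zio.c hunks below), so no per-bio jiffies
+	 * bookkeeping is needed here.
+	 */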
vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio); invalidate_bdev(bdev); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 3dc3d0d9d..1400aee7b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -208,6 +208,107 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } /* + * Generate the nvlist representing this vdev's stats + */ +void +vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) +{ + nvlist_t *nvx; + vdev_stat_t *vs; + vdev_stat_ex_t *vsx; + + vs = kmem_alloc(sizeof (*vs), KM_SLEEP); + vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); + + vdev_get_stats_ex(vd, vs, vsx); + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); + + kmem_free(vs, sizeof (*vs)); + + /* + * Add extended stats into a special extended stats nvlist. This keeps + * all the extended stats nicely grouped together. The extended stats + * nvlist is then added to the main nvlist. + */ + nvx = fnvlist_alloc(); + + /* ZIOs in flight to disk */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + + /* ZIOs pending */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + + /* Histograms */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], + 
ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + + /* Add extended stats nvlist to main nvlist */ + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); + + kmem_free(vsx, sizeof (*vsx)); +} + +/* * Generate the nvlist representing this vdev's config. */ nvlist_t * @@ -215,7 +316,6 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; - nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); @@ -306,12 +406,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, } if (getstats) { - vdev_stat_t vs; pool_scan_stat_t ps; - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); + vdev_config_generate_stats(vd, nv); /* provide either current or previous scan information */ if (spa_scan_get_stats(spa, &ps) == 0) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 2d16e632d..523a924d6 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -39,6 +39,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/time.h> /* * ========================================================================== @@ -2694,6 +2695,8 @@ zio_vdev_io_start(zio_t *zio) uint64_t align; spa_t *spa = zio->io_spa; + zio->io_delay = 0; + ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); @@ -2799,6 +2802,7 @@ zio_vdev_io_start(zio_t *zio) } } + zio->io_delay = gethrtime(); vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } @@ -2815,6 +2819,9 @@ zio_vdev_io_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_delay) + zio->io_delay = gethrtime() - zio->io_delay; + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); @@ -3217,7 +3224,7 @@ zio_done(zio_t *zio) * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. 
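 *
 * (Arithmetic check, assuming zio_delay_max keeps its documented
 * 30-second default of 30,000 ms: MSEC2NSEC(30000) is
 * 30,000 * 1,000,000 = 30,000,000,000 ns, the same threshold as
 * before, now compared against the hrtime-based io_delay rather
 * than jiffies.)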
*/ - if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) { + if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio, 0, 0); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 179f82e43..c9b882987 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -370,7 +370,7 @@ tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am index 2c292b999..621dff91f 100644 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am @@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_iostat_001_neg.ksh \ zpool_iostat_002_pos.ksh \ - zpool_iostat_003_neg.ksh + zpool_iostat_003_neg.ksh \ + zpool_iostat_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh index d275e063b..77eb6bd34 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh @@ -33,4 +33,4 @@ DISK=${DISKS%% *} -default_setup $DISK +default_raidz_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh index 37062ca53..ec5599ace 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh @@ -33,13 +33,13 @@ # # DESCRIPTION: -# Verify that 'zpool iostat [interval [count]' can be executed as non-root. +# Verify that 'zpool iostat [interval [count]]' can be executed as non-root. # # STRATEGY: # 1. Set the interval to 1 and count to 4. # 2. Sleep for 4 seconds. # 3. Verify that the output has 4 records. -# +# 4. Set interval to 0.5 and count to 1 to test floating point intervals. verify_runnable "both" @@ -68,4 +68,7 @@ if [[ $stat_count -ne 4 ]]; then log_fail "zpool iostat [pool_name] [interval] [count] failed" fi +# Test a floating point interval value +log_must $ZPOOL iostat -v 0.5 1 + log_pass "zpool iostat [pool_name ...] [interval] [count] passed" diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh index d73f5d5c8..ae1e5a152 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh @@ -51,13 +51,14 @@ else fi set -A args "" "-?" 
"-f" "nonexistpool" "$TESTPOOL/$TESTFS" \ - "$testpool 1.23" "$testpool 0" "$testpool -1" "$testpool 1 0" \ - "$testpool 0 0" + "$testpool 0" "$testpool -1" "$testpool 1 0" \ + "$testpool 0 0" "$testpool -wl" "$testpool -wq" log_assert "Executing 'zpool iostat' with bad options fails" typeset -i i=1 while [[ $i -lt ${#args[*]} ]]; do + log_assert "doing $ZPOOL iostat ${args[i]}" log_mustnot $ZPOOL iostat ${args[i]} ((i = i + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh new file mode 100755 index 000000000..70318dbb9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +# Copyright (C) 2016 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Executing 'zpool iostat' command with various combinations of extended +# stats (-vqL), parseable/script options (-pH), and misc lists of pools +# and vdevs. +# +# STRATEGY: +# 1. Create an array of mixed 'zpool iostat' options. +# 2. Execute each element of the array. +# 3. Verify an error code is returned. +# + +verify_runnable "both" + +typeset testpool +if is_global_zone ; then + testpool=$TESTPOOL +else + testpool=${TESTPOOL%%/*} +fi + +set -A args "" "-v" "-q" "-l" "-lq $TESTPOOL" "-ql ${DISKS[0]} ${DISKS[1]}" \ + "-w $TESTPOOL ${DISKS[0]} ${DISKS[1]}" \ + "-wp $TESTPOOL" \ + "-qlH $TESTPOOL ${DISKS[0]}" \ + "-vpH ${DISKS[0]}" \ + "-wpH ${DISKS[0]}" + +log_assert "Executing 'zpool iostat' with extended stat options succeeds" +log_note "testpool: $TESTPOOL, disks $DISKS" + +typeset -i i=1 +while [[ $i -lt ${#args[*]} ]]; do + log_note "doing $ZPOOL iostat ${args[i]}" + log_must $ZPOOL iostat ${args[i]} + ((i = i + 1)) +done + +log_pass "Executing 'zpool iostat' with extended stat options succeeds" |