diff options
author | Don Brady <[email protected]> | 2018-09-05 19:33:36 -0600 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-09-05 18:33:36 -0700 |
commit | cc99f275a28c43fe450a66a7544f73c4935f7361 (patch) | |
tree | f867e1d2cbb550a047c0f87986831252c41a2fd9 /cmd | |
parent | cfa37548ebc880580782b245f2d233ed540e7a01 (diff) |
Pool allocation classes
Allocation Classes add the ability to have allocation classes in a
pool that are dedicated to serving specific block categories, such
as DDT data, metadata, and small file blocks. A pool can opt-in to
this feature by adding a 'special' or 'dedup' top-level VDEV.
Reviewed by: Pavel Zakharov <[email protected]>
Reviewed-by: Richard Laager <[email protected]>
Reviewed-by: Alek Pinchuk <[email protected]>
Reviewed-by: HÃ¥kan Johansson <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Reviewed-by: DHE <[email protected]>
Reviewed-by: Richard Elling <[email protected]>
Reviewed-by: Gregor Kopka <[email protected]>
Reviewed-by: Kash Pande <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #5182
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/zdb/zdb.c | 128 | ||||
-rw-r--r-- | cmd/zpool/zpool_main.c | 285 | ||||
-rw-r--r-- | cmd/zpool/zpool_vdev.c | 113 | ||||
-rw-r--r-- | cmd/ztest/ztest.c | 187 |
4 files changed, 582 insertions, 131 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 38ffbbece..21113da2f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. @@ -932,13 +932,23 @@ dump_metaslab(metaslab_t *msp) static void print_vdev_metaslab_header(vdev_t *vd) { - (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n", - (u_longlong_t)vd->vdev_id, + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *bias_str; + + bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? + VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : + vd->vdev_islog ? "log" : ""; + + (void) printf("\tvdev %10llu %s\n" + "\t%-10s%5llu %-19s %-15s %-12s\n", + (u_longlong_t)vd->vdev_id, bias_str, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); - (void) printf("\t%15s %19s %15s %10s\n", + (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", - "---------------", "-------------"); + "---------------", "------------"); } static void @@ -954,7 +964,7 @@ dump_metaslab_groups(spa_t *spa) vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; - if (mg->mg_class != mc) + if (mg == NULL || mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); @@ -3158,6 +3168,7 @@ typedef struct zdb_blkstats { uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; + uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; @@ -3197,6 +3208,16 @@ typedef struct zdb_cb { uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; +/* test if two DVA offsets from same vdev are within the same metaslab */ +static boolean_t +same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t ms_shift = vd->vdev_ms_shift; + + return ((off1 >> ms_shift) == (off2 >> ms_shift)); +} + static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) @@ -3209,6 +3230,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -3234,8 +3257,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; + + if (same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == @@ -3244,13 +3274,37 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal != 0) + if (equal != 0) { zb->zb_ditto_samevdev++; + + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + } break; } - } + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] @@ -4103,6 +4157,8 @@ dump_block_stats(spa_t *spa) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); @@ -4147,7 +4203,10 @@ dump_block_stats(spa_t *spa) norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); - total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); + total_alloc = norm_alloc + + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_dedup_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; @@ -4169,31 +4228,50 @@ dump_block_stats(spa_t *spa) return (2); (void) printf("\n"); - (void) printf("\tbp count: %10llu\n", + (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); - (void) printf("\tganged count: %10llu\n", + (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); - (void) printf("\tbp logical: %10llu avg: %6llu\n", + (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); - (void) printf("\tbp physical: %10llu avg:" - " %6llu compression: %6.2f\n", - (u_longlong_t)tzb->zb_psize, + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); - (void) printf("\tbp allocated: %10llu avg:" - " %6llu compression: %6.2f\n", - (u_longlong_t)tzb->zb_asize, + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); - (void) printf("\tbp deduped: %10llu ref>1:" - " %6llu deduplication: %6.2f\n", - (u_longlong_t)zcb.zcb_dedup_asize, + (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", + "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); - (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", + (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + if (spa_special_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + if (spa_dedup_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_dedup_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_dedup_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Dedup class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; @@ -4215,6 +4293,10 @@ dump_block_stats(spa_t *spa) (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } + if (tzb->zb_ditto_same_ms != 0) { + (void) printf("\tDittoed blocks in same metaslab: %llu\n", + (longlong_t)tzb->zb_ditto_same_ms); + } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 73744a87c..37124566e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -22,13 +22,14 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov <[email protected]>. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. */ #include <assert.h> @@ -291,6 +292,8 @@ static zpool_command_t command_table[] = { #define NCOMMAND (ARRAY_SIZE(command_table)) +#define VDEV_ALLOC_CLASS_LOGS "logs" + static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; @@ -393,7 +396,7 @@ print_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); + (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); @@ -445,14 +448,14 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-15s %s %s\n\n", + (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); - (void) fprintf(fp, "\t%-15s ", "feature@..."); + (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " @@ -470,32 +473,45 @@ usage(boolean_t requested) exit(requested ? 0 : 2); } -void +/* + * print a pool vdev config for dry runs + */ +static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, - boolean_t print_logs, int name_flags) + const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; - - if (name != NULL) - (void) printf("\t%*s%s\n", indent, "", name); + boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) + &child, &children) != 0) { + if (name != NULL) + (void) printf("\t%*s%s\n", indent, "", name); return; + } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; + char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) + if (is_log) + class = VDEV_ALLOC_BIAS_LOG; + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &class); + if (strcmp(match, class) != 0) continue; + if (!printed && name != NULL) { + (void) printf("\t%*s%s\n", indent, "", name); + printed = B_TRUE; + } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); - print_vdev_tree(zhp, vname, child[c], indent + 2, - B_FALSE, name_flags); + print_vdev_tree(zhp, vname, child[c], indent + 2, "", + name_flags); free(vname); } } @@ -734,20 +750,25 @@ zpool_do_add(int argc, char **argv) "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ - print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE, + print_vdev_tree(zhp, poolname, poolnvroot, 0, "", + name_flags | VDEV_NAME_TYPE_ID); + print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); + + /* print other classes: 'dedup', 'special', and 'log' */ + print_vdev_tree(zhp, "dedup", poolnvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE, name_flags); - /* Do the same for the logs */ - if (num_logs(poolnvroot) > 0) { - print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE, - name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE, - name_flags); - } else if (num_logs(nvroot) > 0) { - print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE, - name_flags); - } + print_vdev_tree(zhp, "special", poolnvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, + name_flags); + + print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, + name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, + name_flags); /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, @@ -1305,9 +1326,13 @@ zpool_do_create(int argc, char **argv) (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); - print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE, 0); - if (num_logs(nvroot) > 0) - print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE, 0); + print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); + print_vdev_tree(NULL, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); + print_vdev_tree(NULL, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, 0); ret = 0; } else { @@ -1865,6 +1890,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, &ishole); if (islog || ishole) continue; + /* Only print normal classes here */ + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, @@ -1951,6 +1980,8 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, &is_log); if (is_log) continue; + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); @@ -1982,34 +2013,56 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, } /* - * Print log vdevs. - * Logs are recorded as top level vdevs in the main pool child array - * but with "is_log" set to 1. We use either print_status_config() or - * print_import_config() to print the top level logs then any log - * children (eg mirrored slogs) are printed recursively - which - * works because only the top level vdev is marked "is_log" + * Print specialized class vdevs. + * + * These are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1 or an "alloc_bias" string. We use either + * print_status_config() or print_import_config() to print the top level + * class vdevs then any of their children (eg mirrored slogs) are printed + * recursively - which works because only the top level vdev is marked. */ static void -print_logs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv) +print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, + const char *class) { uint_t c, children; nvlist_t **child; + boolean_t printed = B_FALSE; + + assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; - (void) printf(gettext("\tlogs\n")); - for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; - char *name; + char *bias = NULL; + char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if (!is_log) + + if (is_log) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + + if (bias == NULL || strcmp(bias, class) != 0) + continue; + if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; - name = zpool_vdev_name(g_zfs, zhp, child[c], + + if (!printed) { + (void) printf("\t%s\t\n", gettext(class)); + printed = B_TRUE; + } + + char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, @@ -2313,8 +2366,10 @@ show_import(nvlist_t *config) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); - if (num_logs(nvroot) > 0) - print_logs(NULL, &cb, nvroot); + + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " @@ -3810,6 +3865,12 @@ print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) format, column_width, cb->cb_scripted); } +static const char *class_name[] = { + VDEV_ALLOC_BIAS_DEDUP, + VDEV_ALLOC_BIAS_SPECIAL, + VDEV_ALLOC_CLASS_LOGS +}; + /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this @@ -3817,7 +3878,7 @@ print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) * * Returns the number of stat lines printed. */ -unsigned int +static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { @@ -3945,6 +4006,9 @@ children: children = MIN(oldchildren, children); } + /* + * print normal top-level devices + */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; @@ -3957,6 +4021,9 @@ children: if (ishole || islog) continue; + if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, @@ -3965,31 +4032,47 @@ children: } /* - * Log device section + * print all other top-level devices */ - - if (num_logs(newnv) > 0) { - if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && - !cb->cb_vdev_names) { - print_iostat_dashes(cb, 0, "logs"); - } - printf("\n"); + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; + char *bias = NULL; + char *type = NULL; + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); - if (islog) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - cb->cb_name_flags); - ret += print_vdev_stats(zhp, vname, oldnv ? - oldchild[c] : NULL, newchild[c], - cb, depth + 2); - free(vname); + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_TYPE, &type); } - } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + if (!printed) { + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && + !cb->cb_scripted && !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, + class_name[n]); + } + printf("\n"); + printed = B_TRUE; + } + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? + oldchild[c] : NULL, newchild[c], cb, depth + 2); + free(vname); + } } /* @@ -4937,6 +5020,7 @@ typedef struct list_cbdata { boolean_t cb_literal; } list_cbdata_t; + /* * Given a list of columns to display, output appropriate headers for each one. */ @@ -4991,7 +5075,7 @@ print_header(list_cbdata_t *cb) /* * Given a pool and a list of properties, print out all the properties according - * to the described layout. + * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) @@ -5087,12 +5171,14 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, } break; case ZPOOL_PROP_CAPACITY: + /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", - (unsigned long long)value); + (unsigned long long)value / 100); else - (void) snprintf(propval, sizeof (propval), "%llu%%", - (unsigned long long)value); + (void) snprintf(propval, sizeof (propval), + value < 1000 ? "%1.2f%%" : value < 10000 ? + "%2.1f%%" : "%3.0f%%", value / 100.0); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); @@ -5107,6 +5193,10 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, (void) printf(" %*s", (int)width, propval); } +/* + * print static default line per vdev + * not compatible with '-o' <proplist> option + */ void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth) @@ -5117,7 +5207,6 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; - boolean_t haslog = B_FALSE; char *dashes = "%-*s - - - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, @@ -5165,7 +5254,7 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : - (vs->vs_alloc * 100 / vs->vs_space); + (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel, format); (void) printf("\n"); @@ -5175,6 +5264,7 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, &child, &children) != 0) return; + /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; @@ -5183,10 +5273,11 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, continue; if (nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { - haslog = B_TRUE; + ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) + continue; + + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; - } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); @@ -5194,13 +5285,34 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, free(vname); } - if (haslog == B_TRUE) { - /* LINTED E_SEC_PRINTF_VAR_FMT */ - (void) printf(dashes, cb->cb_namewidth, "log"); + /* list the classes: 'logs', 'dedup', and 'special' */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + for (c = 0; c < children; c++) { + char *bias = NULL; + char *type = NULL; + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &islog) != 0 || !islog) + &islog) == 0 && islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, + class_name[n]); + printed = B_TRUE; + } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); @@ -5233,7 +5345,6 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, } } - /* * Generic callback function to list a pool. */ @@ -5246,13 +5357,21 @@ list_callback(zpool_handle_t *zhp, void *data) config = zpool_get_config(zhp, NULL); + if (cbp->cb_verbose) { + config = zpool_get_config(zhp, NULL); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + + if (cbp->cb_verbose) + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, + cbp->cb_name_flags); + print_pool(zhp, cbp); - if (!cbp->cb_verbose) - return (0); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - print_list_stats(zhp, NULL, nvroot, cbp, 0); + if (cbp->cb_verbose) + print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } @@ -5315,6 +5434,7 @@ zpool_do_list(int argc, char **argv) break; case 'v': cb.cb_verbose = B_TRUE; + cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -5716,7 +5836,7 @@ zpool_do_split(int argc, char **argv) if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); - print_vdev_tree(NULL, newpool, config, 0, B_FALSE, + print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); } } @@ -6965,11 +7085,14 @@ status_callback(zpool_handle_t *zhp, void *data) print_cmd_columns(cbp->vcdl, 0); printf("\n"); + print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE); - if (num_logs(nvroot) > 0) - print_logs(zhp, cbp, nvroot); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index e61655f28..37f1ca1d9 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - * Copyright (c) 2016 Intel Corporation. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. * Copyright 2016 Igor Kozhukhov <[email protected]>. */ @@ -684,6 +684,9 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -742,6 +745,9 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. + * + * If there is no current spec (create), make sure new spec has at least + * one general purpose vdev. */ typedef struct replication_level { char *zprl_type; @@ -965,7 +971,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) /* * At this point, we have the replication of the last toplevel - * vdev in 'rep'. Compare it to 'lastrep' to see if its + * vdev in 'rep'. Compare it to 'lastrep' to see if it is * different. */ if (lastrep.zprl_type != NULL) { @@ -1466,6 +1472,13 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (VDEV_TYPE_LOG); } + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || + strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + if (mindev != NULL) + *mindev = 1; + return (type); + } + if (strcmp(type, "cache") == 0) { if (mindev != NULL) *mindev = 1; @@ -1487,7 +1500,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type; - uint64_t is_log; + uint64_t is_log, is_special, is_dedup; boolean_t seen_logs; top = NULL; @@ -1497,7 +1510,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; @@ -1520,7 +1533,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; } if (strcmp(type, VDEV_TYPE_LOG) == 0) { @@ -1533,6 +1546,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; + is_special = B_FALSE; + is_dedup = B_FALSE; argc--; argv++; /* @@ -1542,6 +1557,24 @@ construct_spec(nvlist_t *props, int argc, char **argv) continue; } + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { + is_special = B_TRUE; + is_log = B_FALSE; + is_dedup = B_FALSE; + argc--; + argv++; + continue; + } + + if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + is_dedup = B_TRUE; + is_log = B_FALSE; + is_special = B_FALSE; + argc--; + argv++; + continue; + } + if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { if (l2cache != NULL) { (void) fprintf(stderr, @@ -1550,15 +1583,16 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; } - if (is_log) { + if (is_log || is_special || is_dedup) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " - "specification: unsupported 'log' " - "device: %s\n"), type); + "specification: unsupported '%s' " + "device: %s\n"), is_log ? "log" : + "special", type); goto spec_out; } nlogs++; @@ -1615,12 +1649,27 @@ construct_spec(nvlist_t *props, int argc, char **argv) nl2cache = children; continue; } else { + /* create a top-level vdev with children */ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, @@ -1645,6 +1694,16 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (is_log) nlogs++; + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } argc--; argv++; } @@ -1745,6 +1804,30 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, return (newroot); } +static int +num_normal_vdevs(nvlist_t *nvroot) +{ + nvlist_t **top; + uint_t t, toplevels, normal = 0; + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &top, &toplevels) == 0); + + for (t = 0; t < toplevels; t++) { + uint64_t log = B_FALSE; + + (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); + if (log) + continue; + if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + normal++; + } + + return (normal); +} + /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that @@ -1798,6 +1881,16 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, } /* + * On pool create the new vdev spec must have one normal vdev. + */ + if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { + vdev_error(gettext("at least one general top-level vdev must " + "be specified\n")); + nvlist_free(newroot); + return (NULL); + } + + /* * Run through the vdev specification and label any whole disks found. */ if (!dryrun && make_disks(zhp, newroot) != 0) { diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 71d5ed646..83d057a74 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -20,11 +20,12 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ /* @@ -149,6 +150,12 @@ typedef struct ztest_shared_hdr { static ztest_shared_hdr_t *ztest_shared_hdr; +enum ztest_class_state { + ZTEST_VDEV_CLASS_OFF, + ZTEST_VDEV_CLASS_ON, + ZTEST_VDEV_CLASS_RND +}; + typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; @@ -171,6 +178,7 @@ typedef struct ztest_shared_opts { uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; int zo_mmp_test; + int zo_special_vdevs; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { @@ -194,6 +202,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = 32 << 10, + .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, }; extern uint64_t metaslab_force_ganging; @@ -342,6 +351,7 @@ ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; +ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; @@ -398,6 +408,7 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), + ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_remap_blocks, 1, &zopt_sometimes), @@ -666,6 +677,7 @@ usage(boolean_t requested) "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" "\t[-B alt_ztest (default: <none>)] alternate ztest path\n" + "\t[-C vdev class state (default: random)] special=on|off|random\n" "\t[-o variable=value] ... set global variable to an unsigned\n" "\t 32-bit integer value\n" "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" @@ -691,6 +703,46 @@ usage(boolean_t requested) exit(requested ? 0 : 1); } + +static void +ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) +{ + char name[32]; + char *value; + int state = ZTEST_VDEV_CLASS_RND; + + (void) strlcpy(name, input, sizeof (name)); + + value = strchr(name, '='); + if (value == NULL) { + (void) fprintf(stderr, "missing value in property=value " + "'-C' argument (%s)\n", input); + usage(B_FALSE); + } + *(value) = '\0'; + value++; + + if (strcmp(value, "on") == 0) { + state = ZTEST_VDEV_CLASS_ON; + } else if (strcmp(value, "off") == 0) { + state = ZTEST_VDEV_CLASS_OFF; + } else if (strcmp(value, "random") == 0) { + state = ZTEST_VDEV_CLASS_RND; + } else { + (void) fprintf(stderr, "invalid property value '%s'\n", value); + usage(B_FALSE); + } + + if (strcmp(name, "special") == 0) { + zo->zo_special_vdevs = state; + } else { + (void) fprintf(stderr, "invalid property name '%s'\n", name); + usage(B_FALSE); + } + if (zo->zo_verbose >= 3) + (void) printf("%s vdev state is '%s'\n", name, value); +} + static void process_options(int argc, char **argv) { @@ -704,7 +756,7 @@ process_options(int argc, char **argv) bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) { + "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': @@ -795,6 +847,9 @@ process_options(int argc, char **argv) case 'B': (void) strlcpy(altdir, optarg, sizeof (altdir)); break; + case 'C': + ztest_parse_name_value(optarg, zo); + break; case 'o': if (set_global_var(optarg) != 0) usage(B_FALSE); @@ -1022,13 +1077,16 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, static nvlist_t * make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, - int log, int r, int m, int t) + const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; + boolean_t log; ASSERT(t > 0); + log = (class != NULL && strcmp(class, "log") == 0); + child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { @@ -1036,6 +1094,12 @@ make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, r, m); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log) == 0); + + if (class != NULL && class[0] != '\0') { + ASSERT(m > 1 || log); /* expecting a mirror */ + VERIFY(nvlist_add_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); + } } VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); @@ -1075,6 +1139,8 @@ ztest_random_spa_version(uint64_t initial_version) static int ztest_random_blocksize(void) { + ASSERT(ztest_spa->spa_max_ashift != 0); + /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. @@ -2722,7 +2788,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad file. */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); @@ -2730,7 +2796,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad mirror. */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); @@ -2740,7 +2806,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) * what's in the nvroot; we should fail with EEXIST. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); @@ -2816,7 +2882,7 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -2990,10 +3056,16 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { + metaslab_group_t *mg; + /* - * Grab the guid from the head of the log class rotor. + * find the first real slog in log allocation class */ - guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + mg = spa_log_class(spa)->mc_rotor; + while (!mg->mg_vd->vdev_islog) + mg = mg->mg_next; + + guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); @@ -3024,12 +3096,11 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_VDEV, FTAG); /* - * Make 1/4 of the devices be log devices. + * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, - ztest_opts.zo_vdev_size, 0, - ztest_random(4) == 0, ztest_opts.zo_raidz, - zs->zs_mirrors, 1); + ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? + "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3048,6 +3119,83 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_vdev_lock); } +/* ARGSUSED */ +void +ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + nvlist_t *nvroot; + const char *class = (ztest_random(2) == 0) ? + VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; + int error; + + /* + * By default add a special vdev 50% of the time + */ + if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || + (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && + ztest_random(2) == 0)) { + return; + } + + mutex_enter(&ztest_vdev_lock); + + /* Only test with mirrors */ + if (zs->zs_mirrors < 2) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* requires feature@allocation_classes */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { + mutex_exit(&ztest_vdev_lock); + return; + } + + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + spa_config_exit(spa, SCL_VDEV, FTAG); + + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + + /* + * 50% of the time allow small blocks in the special class + */ + if (error == 0 && + spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { + if (ztest_opts.zo_verbose >= 3) + (void) printf("Enabling special VDEV small blocks\n"); + (void) ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); + } + + mutex_exit(&ztest_vdev_lock); + + if (ztest_opts.zo_verbose >= 3) { + metaslab_class_t *mc; + + if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) + mc = spa_special_class(spa); + else + mc = spa_dedup_class(spa); + (void) printf("Added a %s mirrored vdev (of %d)\n", + class, (int)mc->mc_groups); + } +} + /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ @@ -3114,7 +3262,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, - (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); + (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { @@ -3316,11 +3464,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * Locate this vdev. */ oldvd = rvd->vdev_child[top]; + + /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; } + + /* pick a child out of the raidz group */ if (ztest_opts.zo_raidz > 1) { ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); @@ -3422,7 +3574,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, - ashift, 0, 0, 0, 1); + ashift, NULL, 0, 0, 1); error = spa_vdev_attach(spa, oldguid, root, replacing); @@ -3688,7 +3840,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } ASSERT(psize > 0); - newsize = psize + psize / 8; + newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { @@ -7027,6 +7179,7 @@ make_random_props(void) nvlist_t *props; VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + if (ztest_random(2) == 0) return (props); @@ -7113,7 +7266,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); /* |