author	Brian Behlendorf <[email protected]>	2020-11-13 13:51:51 -0800
committer	GitHub <[email protected]>	2020-11-13 13:51:51 -0800
commit	b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree	6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /cmd
parent	a724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values.  This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    slag7                 ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```

When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command.  These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random  - kind of RAID to test
    -D <value>             - dRAID data drives per group
    -S <value>             - dRAID distributed hot spares
    -R <value>             - RAID parity (raidz or dRAID)

The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.

Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
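The colon-separated dRAID specification above can be parsed with plain string scanning. Below is a minimal, hypothetical C sketch of that grammar; the function name, defaults, and absence of range checking are illustration-only assumptions, and the authoritative implementation is `draid_config_by_type()` in `cmd/zpool/zpool_vdev.c` further down in this diff.

```c
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical parser for "draid[<parity>][:<data>d][:<children>c][:<spares>s]".
 * Returns 0 on success, -1 on a malformed spec.
 */
static int
parse_draid(const char *type, long *parity, long *data, long *children,
    long *spares)
{
	const char *p = type + strlen("draid");
	char *end;

	*parity = 1;	/* default parity level */
	*data = 8;	/* default data devices per group */
	*children = 0;	/* 0 means "derive from the vdev list" */
	*spares = 0;	/* default distributed spares */

	if (isdigit((unsigned char)*p)) {
		*parity = strtol(p, &end, 10);
		p = end;
	}

	while (*p == ':') {
		long v = strtol(p + 1, &end, 10);
		switch (tolower((unsigned char)*end)) {
		case 'd': *data = v; break;
		case 'c': *children = v; break;
		case 's': *spares = v; break;
		default: return (-1);	/* unknown suffix */
		}
		p = end + 1;
	}
	return (*p == '\0' ? 0 : -1);
}

int
main(void)
{
	long parity, data, children, spares;

	if (parse_draid("draid2:8d:68c:2s", &parity, &data, &children,
	    &spares) == 0) {
		printf("parity=%ld data=%ld children=%ld spares=%ld\n",
		    parity, data, children, spares);
	}
	return (0);
}
```

Run against the spec from the status output above, the sketch prints `parity=2 data=8 children=68 spares=2`.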
Diffstat (limited to 'cmd')
-rw-r--r--cmd/raidz_test/raidz_bench.c25
-rw-r--r--cmd/raidz_test/raidz_test.c330
-rw-r--r--cmd/raidz_test/raidz_test.h9
-rw-r--r--cmd/zdb/zdb.c10
-rw-r--r--cmd/zed/agents/zfs_mod.c10
-rw-r--r--cmd/zed/agents/zfs_retire.c11
-rw-r--r--cmd/zfs/zfs_main.c130
-rw-r--r--cmd/zpool/zpool_main.c5
-rw-r--r--cmd/zpool/zpool_vdev.c393
-rw-r--r--cmd/ztest/ztest.c281
10 files changed, 1011 insertions, 193 deletions
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c
index 8a2cec4ca..a3446c52c 100644
--- a/cmd/raidz_test/raidz_bench.c
+++ b/cmd/raidz_test/raidz_bench.c
@@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl)
/* create suitable raidz_map */
ncols = rto_opts.rto_dcols + fn + 1;
zio_bench.io_size = 1ULL << ds;
- rm_bench = vdev_raidz_map_alloc(&zio_bench,
- BENCH_ASHIFT, ncols, fn+1);
+
+ if (rto_opts.rto_expand) {
+ rm_bench = vdev_raidz_map_alloc_expanded(
+ zio_bench.io_abd,
+ zio_bench.io_size, zio_bench.io_offset,
+ rto_opts.rto_ashift, ncols+1, ncols,
+ fn+1, rto_opts.rto_expand_offset);
+ } else {
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, fn+1);
+ }
/* estimate iteration count */
iter_cnt = GEN_BENCH_MEMORY;
@@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl)
(1ULL << BENCH_ASHIFT))
continue;
- rm_bench = vdev_raidz_map_alloc(&zio_bench,
- BENCH_ASHIFT, ncols, PARITY_PQR);
+ if (rto_opts.rto_expand) {
+ rm_bench = vdev_raidz_map_alloc_expanded(
+ zio_bench.io_abd,
+ zio_bench.io_size, zio_bench.io_offset,
+ BENCH_ASHIFT, ncols+1, ncols,
+ PARITY_PQR, rto_opts.rto_expand_offset);
+ } else {
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, PARITY_PQR);
+ }
/* estimate iteration count */
iter_cnt = (REC_BENCH_MEMORY);
diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c
index 66f36b0d5..4e2639f36 100644
--- a/cmd/raidz_test/raidz_test.c
+++ b/cmd/raidz_test/raidz_test.c
@@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force)
(void) fprintf(stdout, DBLSEP "Running with options:\n"
" (-a) zio ashift : %zu\n"
" (-o) zio offset : 1 << %zu\n"
+ " (-e) expanded map : %s\n"
+ " (-r) reflow offset : %llx\n"
" (-d) number of raidz data columns : %zu\n"
" (-s) size of DATA : 1 << %zu\n"
" (-S) sweep parameters : %s \n"
" (-v) verbose : %s \n\n",
- opts->rto_ashift, /* -a */
- ilog2(opts->rto_offset), /* -o */
- opts->rto_dcols, /* -d */
- ilog2(opts->rto_dsize), /* -s */
- opts->rto_sweep ? "yes" : "no", /* -S */
- verbose); /* -v */
+ opts->rto_ashift, /* -a */
+ ilog2(opts->rto_offset), /* -o */
+ opts->rto_expand ? "yes" : "no", /* -e */
+ (u_longlong_t)opts->rto_expand_offset, /* -r */
+ opts->rto_dcols, /* -d */
+ ilog2(opts->rto_dsize), /* -s */
+ opts->rto_sweep ? "yes" : "no", /* -S */
+ verbose); /* -v */
}
}
@@ -104,6 +108,8 @@ static void usage(boolean_t requested)
"\t[-S parameter sweep (default: %s)]\n"
"\t[-t timeout for parameter sweep test]\n"
"\t[-B benchmark all raidz implementations]\n"
+ "\t[-e use expanded raidz map (default: %s)]\n"
+ "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
"\t[-v increase verbosity (default: %zu)]\n"
"\t[-h (print help)]\n"
"\t[-T test the test, see if failure would be detected]\n"
@@ -114,6 +120,8 @@ static void usage(boolean_t requested)
o->rto_dcols, /* -d */
ilog2(o->rto_dsize), /* -s */
rto_opts.rto_sweep ? "yes" : "no", /* -S */
+ rto_opts.rto_expand ? "yes" : "no", /* -e */
+ (u_longlong_t)o->rto_expand_offset, /* -r */
o->rto_v); /* -v */
exit(requested ? 0 : 1);
@@ -128,7 +136,7 @@ static void process_options(int argc, char **argv)
bcopy(&rto_opts_defaults, o, sizeof (*o));
- while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
+ while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
value = 0;
switch (opt) {
@@ -136,6 +144,12 @@ static void process_options(int argc, char **argv)
value = strtoull(optarg, NULL, 0);
o->rto_ashift = MIN(13, MAX(9, value));
break;
+ case 'e':
+ o->rto_expand = 1;
+ break;
+ case 'r':
+ o->rto_expand_offset = strtoull(optarg, NULL, 0);
+ break;
case 'o':
value = strtoull(optarg, NULL, 0);
o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
@@ -179,25 +193,34 @@ static void process_options(int argc, char **argv)
}
}
-#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
-#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
+#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
+#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)
-#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
-#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
+#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
+#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
static int
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
{
- int i, ret = 0;
+ int r, i, ret = 0;
VERIFY(parity >= 1 && parity <= 3);
- for (i = 0; i < parity; i++) {
- if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
- != 0) {
- ret++;
- LOG_OPT(D_DEBUG, opts,
- "\nParity block [%d] different!\n", i);
+ for (r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t * const rr = rm->rm_row[r];
+ raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
+ for (i = 0; i < parity; i++) {
+ if (CODE_COL_SIZE(rrg, i) == 0) {
+ VERIFY0(CODE_COL_SIZE(rr, i));
+ continue;
+ }
+
+ if (abd_cmp(CODE_COL(rr, i),
+ CODE_COL(rrg, i)) != 0) {
+ ret++;
+ LOG_OPT(D_DEBUG, opts,
+ "\nParity block [%d] different!\n", i);
+ }
}
}
return (ret);
@@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
static int
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
{
- int i, ret = 0;
- int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
+ int r, i, dcols, ret = 0;
+
+ for (r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ raidz_row_t *rrg = opts->rm_golden->rm_row[r];
+ dcols = opts->rm_golden->rm_row[0]->rr_cols -
+ raidz_parity(opts->rm_golden);
+ for (i = 0; i < dcols; i++) {
+ if (DATA_COL_SIZE(rrg, i) == 0) {
+ VERIFY0(DATA_COL_SIZE(rr, i));
+ continue;
+ }
- for (i = 0; i < dcols; i++) {
- if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
- != 0) {
- ret++;
+ if (abd_cmp(DATA_COL(rrg, i),
+ DATA_COL(rr, i)) != 0) {
+ ret++;
- LOG_OPT(D_DEBUG, opts,
- "\nData block [%d] different!\n", i);
+ LOG_OPT(D_DEBUG, opts,
+ "\nData block [%d] different!\n", i);
+ }
}
}
return (ret);
@@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private)
static void
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
{
- int i;
- raidz_col_t *col;
-
- for (i = 0; i < cnt; i++) {
- col = &rm->rm_col[tgts[i]];
- abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ for (int i = 0; i < cnt; i++) {
+ raidz_col_t *col = &rr->rr_col[tgts[i]];
+ abd_iterate_func(col->rc_abd, 0, col->rc_size,
+ init_rand, NULL);
+ }
}
}
@@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
VERIFY0(vdev_raidz_impl_set("original"));
- opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
- opts->rto_ashift, total_ncols, parity);
- rm_test = vdev_raidz_map_alloc(zio_test,
- opts->rto_ashift, total_ncols, parity);
+ if (opts->rto_expand) {
+ opts->rm_golden =
+ vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
+ opts->zio_golden->io_size, opts->zio_golden->io_offset,
+ opts->rto_ashift, total_ncols+1, total_ncols,
+ parity, opts->rto_expand_offset);
+ rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
+ zio_test->io_size, zio_test->io_offset,
+ opts->rto_ashift, total_ncols+1, total_ncols,
+ parity, opts->rto_expand_offset);
+ } else {
+ opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
+ opts->rto_ashift, total_ncols, parity);
+ rm_test = vdev_raidz_map_alloc(zio_test,
+ opts->rto_ashift, total_ncols, parity);
+ }
VERIFY(opts->zio_golden);
VERIFY(opts->rm_golden);
@@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
return (err);
}
+/*
+ * If reflow is not in progress, reflow_offset should be UINT64_MAX.
+ * For each row, if the row is entirely before reflow_offset, it will
+ * come from the new location. Otherwise this row will come from the
+ * old location. Therefore, rows that straddle the reflow_offset will
+ * come from the old location.
+ *
+ * NOTE: Until raidz expansion is implemented this function is only
+ * needed by raidz_test.c to test the multi-row raidz_map_t functionality.
+ */
+raidz_map_t *
+vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+ uint64_t nparity, uint64_t reflow_offset)
+{
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> ashift;
+ uint64_t q, r, bc, devidx, asize = 0, tot;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ * AKA "full rows"
+ */
+ q = s / (logical_cols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (logical_cols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* How many rows contain data (not skip) */
+ uint64_t rows = howmany(tot, logical_cols);
+ int cols = MIN(tot, logical_cols);
+
+ raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+ KM_SLEEP);
+ rm->rm_nrows = rows;
+
+ for (uint64_t row = 0; row < rows; row++) {
+ raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
+ rr_col[cols]), KM_SLEEP);
+ rm->rm_row[row] = rr;
+
+ /* The starting RAIDZ (parent) vdev sector of the row. */
+ uint64_t b = (offset >> ashift) + row * logical_cols;
+
+ /*
+ * If we are in the middle of a reflow, and any part of this
+ * row has not been copied, then use the old location of
+ * this row.
+ */
+ int row_phys_cols = physical_cols;
+ if (b + (logical_cols - nparity) > reflow_offset >> ashift)
+ row_phys_cols--;
+
+ /* starting child of this row */
+ uint64_t child_id = b % row_phys_cols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t child_offset = (b / row_phys_cols) << ashift;
+
+ /*
+ * We set cols to the entire width of the block, even
+ * if this row is shorter. This is needed because parity
+ * generation (for Q and R) needs to know the entire width,
+ * because it treats the short row as though it was
+ * full-width (and the "phantom" sectors were zero-filled).
+ *
+ * Another approach to this would be to set cols shorter
+ * (to just the number of columns that we might do i/o to)
+ * and have another mechanism to tell the parity generation
+ * about the "entire width". Reconstruction (at least
+ * vdev_raidz_reconstruct_general()) would also need to
+ * know about the "entire width".
+ */
+ rr->rr_cols = cols;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+ rr->rr_nempty = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++, child_id++) {
+ if (child_id >= row_phys_cols) {
+ child_id -= row_phys_cols;
+ child_offset += 1ULL << ashift;
+ }
+ rr->rr_col[c].rc_devidx = child_id;
+ rr->rr_col[c].rc_offset = child_offset;
+ rr->rr_col[c].rc_gdata = NULL;
+ rr->rr_col[c].rc_orig_data = NULL;
+ rr->rr_col[c].rc_error = 0;
+ rr->rr_col[c].rc_tried = 0;
+ rr->rr_col[c].rc_skipped = 0;
+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+ uint64_t dc = c - rr->rr_firstdatacol;
+ if (c < rr->rr_firstdatacol) {
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size,
+ B_TRUE);
+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
+ /*
+ * Past the end of the block; this column is only for parity generation.
+ */
+ rr->rr_col[c].rc_size = 0;
+ rr->rr_col[c].rc_abd = NULL;
+ } else {
+ /*
+ * "data column" (col excluding parity)
+ * Add an ASCII art diagram here
+ */
+ uint64_t off;
+
+ if (c < bc || r == 0) {
+ off = dc * rows + row;
+ } else {
+ off = r * rows +
+ (dc - r) * (rows - 1) + row;
+ }
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ rr->rr_col[c].rc_abd =
+ abd_get_offset(abd, off << ashift);
+ }
+
+ asize += rr->rr_col[c].rc_size;
+ }
+ /*
+ * If all data stored spans all columns, there's a danger that
+ * parity will always be on the same device and, since parity
+ * isn't read during normal operation, that that device's I/O
+ * bandwidth won't be used effectively. We therefore switch
+ * the parity every 1MB.
+ *
+ * ...at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices
+ * evenly, we won't see any benefit. Further, occasional writes
+ * that aren't a multiple of the LCM of the number of children
+ * and the minimum stripe width are sufficient to avoid pessimal
+ * behavior. Unfortunately, this decision created an implicit
+ * on-disk format requirement that we need to support for all
+ * eternity, but only for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for
+ * padding we must make sure to note this swap. We will never
+ * intend to skip the first column since at least one data and
+ * one parity column must appear in each row.
+ */
+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+ (offset & (1ULL << 20))) {
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+ devidx = rr->rr_col[0].rc_devidx;
+ uint64_t o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+ }
+
+ }
+ ASSERT3U(asize, ==, tot << ashift);
+
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
+ return (rm);
+}
+
static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
@@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
(*zio)->io_abd = raidz_alloc(alloc_dsize);
init_zio_abd(*zio);
- rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
- total_ncols, parity);
+ if (opts->rto_expand) {
+ rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
+ (*zio)->io_size, (*zio)->io_offset,
+ opts->rto_ashift, total_ncols+1, total_ncols,
+ parity, opts->rto_expand_offset);
+ } else {
+ rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
+ total_ncols, parity);
+ }
VERIFY(rm);
/* Make sure code columns are destroyed */
@@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
if (fn < RAIDZ_REC_PQ) {
/* can reconstruct 1 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
- if (x0 >= rm->rm_cols - raidz_parity(rm))
+ if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
/* Check if should stop */
@@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
} else if (fn < RAIDZ_REC_PQR) {
/* can reconstruct 2 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
- if (x0 >= rm->rm_cols - raidz_parity(rm))
+ if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
- if (x1 >= rm->rm_cols - raidz_parity(rm))
+ if (x1 >= rm->rm_row[0]->rr_cols -
+ raidz_parity(rm))
continue;
/* Check if should stop */
@@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
} else {
/* can reconstruct 3 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
- if (x0 >= rm->rm_cols - raidz_parity(rm))
+ if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
- if (x1 >= rm->rm_cols - raidz_parity(rm))
+ if (x1 >= rm->rm_row[0]->rr_cols -
+ raidz_parity(rm))
continue;
for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
- if (x2 >=
- rm->rm_cols - raidz_parity(rm))
+ if (x2 >= rm->rm_row[0]->rr_cols -
+ raidz_parity(rm))
continue;
/* Check if should stop */
@@ -700,6 +937,8 @@ run_sweep(void)
opts->rto_dcols = dcols_v[d];
opts->rto_offset = (1 << ashift_v[a]) * rand();
opts->rto_dsize = size_v[s];
+ opts->rto_expand = rto_opts.rto_expand;
+ opts->rto_expand_offset = rto_opts.rto_expand_offset;
opts->rto_v = 0; /* be quiet */
VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
@@ -732,6 +971,7 @@ exit:
return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}
+
int
main(int argc, char **argv)
{
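The row-geometry arithmetic at the top of vdev_raidz_map_alloc_expanded() is self-contained and can be sanity-checked outside the test harness. A standalone sketch using the same formulas as the function above, with hypothetical inputs (an 11-sector write across a 5-wide single-parity layout):

```c
#include <stdio.h>
#include <stdint.h>

#define	HOWMANY(x, y)	(((x) + (y) - 1) / (y))

int
main(void)
{
	uint64_t ashift = 12;			/* 4k sectors (assumed) */
	uint64_t size = 11ULL << ashift;	/* hypothetical 11-sector zio */
	uint64_t logical_cols = 5, nparity = 1;

	uint64_t s = size >> ashift;			/* data sectors: 11 */
	uint64_t q = s / (logical_cols - nparity);	/* full rows: 2 */
	uint64_t r = s - q * (logical_cols - nparity);	/* remainder: 3 */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* big columns: 4 */
	/* total data+parity sectors: 11 + 1*(2+1) = 14 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
	uint64_t rows = HOWMANY(tot, logical_cols);	/* rows: 3 */

	printf("q=%llu r=%llu bc=%llu tot=%llu rows=%llu\n",
	    (unsigned long long)q, (unsigned long long)r,
	    (unsigned long long)bc, (unsigned long long)tot,
	    (unsigned long long)rows);
	return (0);
}
```

With no reflow in progress (reflow_offset of UINT64_MAX) every row uses all physical_cols children; a row that extends past reflow_offset instead uses the pre-expansion width of physical_cols - 1, exactly as the row_phys_cols logic above decides.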
diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h
index 09c825ae4..0f7f4cee3 100644
--- a/cmd/raidz_test/raidz_test.h
+++ b/cmd/raidz_test/raidz_test.h
@@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = {
typedef struct raidz_test_opts {
size_t rto_ashift;
- size_t rto_offset;
+ uint64_t rto_offset;
size_t rto_dcols;
size_t rto_dsize;
size_t rto_v;
size_t rto_sweep;
size_t rto_sweep_timeout;
size_t rto_benchmark;
+ size_t rto_expand;
+ uint64_t rto_expand_offset;
size_t rto_sanity;
size_t rto_gdb;
@@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = {
.rto_v = 0,
.rto_sweep = 0,
.rto_benchmark = 0,
+ .rto_expand = 0,
+ .rto_expand_offset = -1ULL,
.rto_sanity = 0,
.rto_gdb = 0,
.rto_should_stop = B_FALSE
@@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio);
void run_raidz_benchmark(void);
+struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
+ uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
+
#endif /* RAIDZ_TEST_H */
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index dbf09a652..d4a37dee0 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
- ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+ if (vd->vdev_ops == &vdev_draid_ops)
+ ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
+ else
+ ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
+
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
@@ -5203,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
- abd_free(zio->io_abd);
-
mutex_enter(&spa->spa_scrub_lock);
spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
cv_broadcast(&spa->spa_scrub_io_cv);
@@ -5231,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio)
blkbuf);
}
mutex_exit(&spa->spa_scrub_lock);
+
+ abd_free(zio->io_abd);
}
static int
diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index 8190beb0c..4a58e1f1d 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return;
}
- ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+ /*
+ * Prefer sequential resilvering when supported (mirrors and dRAID),
+ * otherwise fall back to a traditional healing resilver.
+ */
+ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
+ if (ret != 0) {
+ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
+ B_TRUE, B_FALSE);
+ }
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :
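The attach call above encodes a simple "try the cheap path, then fall back" protocol: request a sequential rebuild first, and retry with a healing resilver only if the vdev type rejects it. A self-contained sketch of that control flow, with zpool_vdev_attach() replaced by a hypothetical stub:

```c
#include <stdio.h>

typedef int boolean_t;
#define	B_TRUE	1
#define	B_FALSE	0

/*
 * Hypothetical stand-in for zpool_vdev_attach(); here it pretends the
 * vdev does not support sequential rebuild (e.g. a raidz child).
 */
static int
attach_stub(const char *old_dev, const char *new_dev, boolean_t replacing,
    boolean_t rebuild)
{
	(void) old_dev; (void) new_dev; (void) replacing;
	return (rebuild ? -1 : 0);
}

int
main(void)
{
	/* Prefer a sequential rebuild (mirrors and dRAID support it). */
	int ret = attach_stub("sda", "sdb", B_TRUE, B_TRUE);
	if (ret != 0) {
		/* Fall back to a traditional healing resilver. */
		ret = attach_stub("sda", "sdb", B_TRUE, B_FALSE);
	}
	printf("attach %s\n", ret == 0 ? "succeeded" : "failed");
	return (0);
}
```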
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index ba8a6de3a..89bb84e48 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
* replace it.
*/
for (s = 0; s < nspares; s++) {
- char *spare_name;
+ boolean_t rebuild = B_FALSE;
+ char *spare_name, *type;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
&spare_name) != 0)
continue;
+ /* prefer sequential resilvering for distributed spares */
+ if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
+ &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+ rebuild = B_TRUE;
+
/* if set, add the "ashift" pool property to the spare nvlist */
if (source != ZPROP_SRC_DEFAULT)
(void) nvlist_add_uint64(spares[s],
@@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
dev_name, basename(spare_name));
if (zpool_vdev_attach(zhp, dev_name, spare_name,
- replacement, B_TRUE, B_FALSE) == 0) {
+ replacement, B_TRUE, rebuild) == 0) {
free(dev_name);
nvlist_free(replacement);
return (B_TRUE);
@@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
* Attempt to substitute a hot spare.
*/
(void) replace_with_spare(hdl, zhp, vdev);
+
zpool_close(zhp);
}
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index f609a4e70..340a7db96 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -893,6 +893,107 @@ usage:
}
/*
+ * Return a default volblocksize for the pool which always uses more than
+ * half of the data sectors. This primarily applies to dRAID which always
+ * writes full stripe widths.
+ */
+static uint64_t
+default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+{
+ uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+ nvlist_t *tree, **vdevs;
+ uint_t nvdevs;
+
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
+ nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
+ &vdevs, &nvdevs) != 0) {
+ return (ZVOL_DEFAULT_BLOCKSIZE);
+ }
+
+ for (int i = 0; i < nvdevs; i++) {
+ nvlist_t *nv = vdevs[i];
+ uint64_t ashift, ndata, nparity;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA,
+ &ndata) == 0) {
+ /* dRAID minimum allocation width */
+ asize = MAX(asize, ndata * (1ULL << ashift));
+ } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &nparity) == 0) {
+ /* raidz minimum allocation width */
+ if (nparity == 1)
+ asize = MAX(asize, 2 * (1ULL << ashift));
+ else
+ asize = MAX(asize, 4 * (1ULL << ashift));
+ } else {
+ /* mirror or (non-redundant) leaf vdev */
+ asize = MAX(asize, 1ULL << ashift);
+ }
+ }
+
+ /*
+ * Calculate the target volblocksize such that more than half
+ * of the asize is used. The following table is for 4k sectors.
+ *
+ * n asize blksz used | n asize blksz used
+ * -------------------------+---------------------------------
+ * 1 4,096 8,192 100% | 9 36,864 32,768 88%
+ * 2 8,192 8,192 100% | 10 40,960 32,768 80%
+ * 3 12,288 8,192 66% | 11 45,056 32,768 72%
+ * 4 16,384 16,384 100% | 12 49,152 32,768 66%
+ * 5 20,480 16,384 80% | 13 53,248 32,768 61%
+ * 6 24,576 16,384 66% | 14 57,344 32,768 57%
+ * 7 28,672 16,384 57% | 15 61,440 32,768 53%
+ * 8 32,768 32,768 100% | 16 65,536 65,536 100%
+ *
+ * This is primarily a concern for dRAID which always allocates
+ * a full stripe width. For dRAID the default stripe width is
+ * n=8 in which case the volblocksize is set to 32k. Ignoring
+ * compression there are no unused sectors. This same reasoning
+ * applies to raidz[2,3] so target 4 sectors to minimize waste.
+ */
+ uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+ while (tgt_volblocksize * 2 <= asize)
+ tgt_volblocksize *= 2;
+
+ const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
+ if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) {
+
+ /* Issue a warning when a non-optimal size is requested. */
+ if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) {
+ (void) fprintf(stderr, gettext("Warning: "
+ "volblocksize (%llu) is less than the default "
+ "minimum block size (%llu).\nTo reduce wasted "
+ "space a volblocksize of %llu is recommended.\n"),
+ (u_longlong_t)volblocksize,
+ (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE,
+ (u_longlong_t)tgt_volblocksize);
+ } else if (volblocksize < tgt_volblocksize) {
+ (void) fprintf(stderr, gettext("Warning: "
+ "volblocksize (%llu) is much less than the "
+ "minimum allocation\nunit (%llu), which wastes "
+ "at least %llu%% of space. To reduce wasted "
+ "space,\nuse a larger volblocksize (%llu is "
+ "recommended), fewer dRAID data disks\n"
+ "per group, or smaller sector size (ashift).\n"),
+ (u_longlong_t)volblocksize, (u_longlong_t)asize,
+ (u_longlong_t)((100 * (asize - volblocksize)) /
+ asize), (u_longlong_t)tgt_volblocksize);
+ }
+ } else {
+ volblocksize = tgt_volblocksize;
+ fnvlist_add_uint64(props, prop, volblocksize);
+ }
+
+ return (volblocksize);
+}
+
+/*
* zfs create [-Pnpv] [-o prop=value] ... fs
* zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
*
@@ -932,6 +1033,7 @@ zfs_do_create(int argc, char **argv)
int ret = 1;
nvlist_t *props;
uint64_t intval;
+ char *strval;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
@@ -1018,7 +1120,7 @@ zfs_do_create(int argc, char **argv)
goto badusage;
}
- if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
+ if (dryrun || type == ZFS_TYPE_VOLUME) {
char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
char *p;
@@ -1040,18 +1142,24 @@ zfs_do_create(int argc, char **argv)
}
}
- /*
- * if volsize is not a multiple of volblocksize, round it up to the
- * nearest multiple of the volblocksize
- */
if (type == ZFS_TYPE_VOLUME) {
- uint64_t volblocksize;
+ const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
+ uint64_t volblocksize = default_volblocksize(zpool_handle,
+ real_props);
- if (nvlist_lookup_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize) != 0)
- volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+ if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE &&
+ nvlist_lookup_string(props, prop, &strval) != 0) {
+ if (asprintf(&strval, "%llu",
+ (u_longlong_t)volblocksize) == -1)
+ nomem();
+ nvlist_add_string(props, prop, strval);
+ free(strval);
+ }
+ /*
+ * If volsize is not a multiple of volblocksize, round it
+ * up to the nearest multiple of the volblocksize.
+ */
if (volsize % volblocksize) {
volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
uint64_t);
@@ -1064,11 +1172,9 @@ zfs_do_create(int argc, char **argv)
}
}
-
if (type == ZFS_TYPE_VOLUME && !noreserve) {
uint64_t spa_version;
zfs_prop_t resv_prop;
- char *strval;
spa_version = zpool_get_prop_int(zpool_handle,
ZPOOL_PROP_VERSION, NULL);
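The block-size table in default_volblocksize() above can be regenerated mechanically. Below is a standalone sketch for 4k sectors; the doubling loop is the same as in the function, and "used" is block bytes divided by the bytes the minimum allocation actually consumes (compression ignored):

```c
#include <stdio.h>
#include <stdint.h>

#define	DEFAULT_BLOCKSIZE	8192ULL	/* ZVOL_DEFAULT_BLOCKSIZE */
#define	HOWMANY(x, y)		(((x) + (y) - 1) / (y))

int
main(void)
{
	for (int n = 1; n <= 16; n++) {
		uint64_t asize = n * 4096ULL;	/* minimum allocation */
		uint64_t blksz = DEFAULT_BLOCKSIZE;

		/* Grow until more than half of asize holds data. */
		while (blksz * 2 <= asize)
			blksz *= 2;

		uint64_t alloc = HOWMANY(blksz, asize) * asize;
		printf("%2d %6llu %6llu %3llu%%\n", n,
		    (unsigned long long)asize, (unsigned long long)blksz,
		    (unsigned long long)(100 * blksz / alloc));
	}
	return (0);
}
```

For n=8, the default dRAID group width, this prints a 32k volblocksize at 100% utilization, matching the table.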
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 83a9b5a5a..524cff335 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2294,7 +2294,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
}
- /* Display vdev initialization and trim status for leaves */
+ /* Display vdev initialization and trim status for leaves. */
if (children == 0) {
print_status_initialize(vs, cb->cb_print_vdev_init);
print_status_trim(vs, cb->cb_print_vdev_trim);
@@ -9849,7 +9849,8 @@ vdev_any_spare_replacing(nvlist_t *nv)
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type);
if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
- strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) {
+ strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) {
return (B_TRUE);
}
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 9aa09b18c..c86081a81 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -86,9 +86,6 @@
boolean_t error_seen;
boolean_t is_force;
-
-
-
/*PRINTFLIKE1*/
void
vdev_error(const char *fmt, ...)
@@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path)
uint_t i, nspares;
boolean_t inuse;
+ if (zpool_is_draid_spare(path))
+ return (B_TRUE);
+
if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
return (B_FALSE);
@@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path)
* /dev/xxx Complete disk path
* /xxx Full path to file
* xxx Shorthand for <zfs_vdev_paths>/xxx
+ * draid* Virtual dRAID spare
*/
static nvlist_t *
-make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
+make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
{
char path[MAXPATHLEN];
struct stat64 statbuf;
@@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
/* After whole disk check restore original passed path */
strlcpy(path, arg, sizeof (path));
+ } else if (zpool_is_draid_spare(arg)) {
+ if (!is_primary) {
+ (void) fprintf(stderr,
+ gettext("cannot open '%s': dRAID spares can only "
+ "be used to replace primary vdevs\n"), arg);
+ return (NULL);
+ }
+
+ wholedisk = B_TRUE;
+ strlcpy(path, arg, sizeof (path));
+ type = VDEV_TYPE_DRAID_SPARE;
} else {
err = is_shorthand_path(arg, path, sizeof (path),
&statbuf, &wholedisk);
@@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
}
}
- /*
- * Determine whether this is a device or a file.
- */
- if (wholedisk || S_ISBLK(statbuf.st_mode)) {
- type = VDEV_TYPE_DISK;
- } else if (S_ISREG(statbuf.st_mode)) {
- type = VDEV_TYPE_FILE;
- } else {
- (void) fprintf(stderr, gettext("cannot use '%s': must be a "
- "block device or regular file\n"), path);
- return (NULL);
+ if (type == NULL) {
+ /*
+ * Determine whether this is a device or a file.
+ */
+ if (wholedisk || S_ISBLK(statbuf.st_mode)) {
+ type = VDEV_TYPE_DISK;
+ } else if (S_ISREG(statbuf.st_mode)) {
+ type = VDEV_TYPE_FILE;
+ } else {
+ fprintf(stderr, gettext("cannot use '%s': must "
+ "be a block device or regular file\n"), path);
+ return (NULL);
+ }
}
/*
@@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
- verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
- if (is_log)
- verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_LOG) == 0);
+
if (strcmp(type, VDEV_TYPE_DISK) == 0)
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
(uint64_t)wholedisk) == 0);
@@ -432,11 +443,16 @@ typedef struct replication_level {
#define ZPOOL_FUZZ (16 * 1024 * 1024)
+/*
+ * N.B. For the purposes of comparing replication levels, dRAID can be
+ * considered functionally equivalent to raidz.
+ */
static boolean_t
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
replication_level_t **raidz, replication_level_t **mirror)
{
- if (strcmp(a->zprl_type, "raidz") == 0 &&
+ if ((strcmp(a->zprl_type, "raidz") == 0 ||
+ strcmp(a->zprl_type, "draid") == 0) &&
strcmp(b->zprl_type, "mirror") == 0) {
*raidz = a;
*mirror = b;
@@ -446,6 +462,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b,
}
/*
+ * Comparison for determining if dRAID and raidz were passed in either order.
+ */
+static boolean_t
+is_raidz_draid(replication_level_t *a, replication_level_t *b)
+{
+ if ((strcmp(a->zprl_type, "raidz") == 0 ||
+ strcmp(a->zprl_type, "draid") == 0) &&
+ (strcmp(b->zprl_type, "raidz") == 0 ||
+ strcmp(b->zprl_type, "draid") == 0)) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev.
@@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
rep.zprl_type = type;
rep.zprl_children = 0;
- if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
+ strcmp(type, VDEV_TYPE_DRAID) == 0) {
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_NPARITY,
&rep.zprl_parity) == 0);
@@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
else
return (NULL);
}
+ } else if (is_raidz_draid(&lastrep, &rep)) {
+ /*
+ * Accept raidz and draid when they can
+ * handle the same number of disk failures.
+ */
+ if (lastrep.zprl_parity != rep.zprl_parity) {
+ if (ret != NULL)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication "
+ "level: %s and %s vdevs "
+ "with different "
+ "redundancy, %llu vs. "
+ "%llu are present\n"),
+ lastrep.zprl_type,
+ rep.zprl_type,
+ lastrep.zprl_parity,
+ rep.zprl_parity);
+ else
+ return (NULL);
+ }
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
0) {
if (ret != NULL)
@@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
return (anyinuse);
}
-static const char *
-is_grouping(const char *type, int *mindev, int *maxdev)
+/*
+ * Returns the parity level extracted from a raidz or draid type.
+ * If the parity cannot be determined zero is returned.
+ */
+static int
+get_parity(const char *type)
{
- if (strncmp(type, "raidz", 5) == 0) {
- const char *p = type + 5;
- char *end;
- long nparity;
+ long parity = 0;
+ const char *p;
+
+ if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
+ p = type + strlen(VDEV_TYPE_RAIDZ);
if (*p == '\0') {
- nparity = 1;
+ /* when unspecified default to single parity */
+ return (1);
} else if (*p == '0') {
- return (NULL); /* no zero prefixes allowed */
+ /* no zero prefixes allowed */
+ return (0);
} else {
+ /* 0-3, no suffixes allowed */
+ char *end;
errno = 0;
- nparity = strtol(p, &end, 10);
- if (errno != 0 || nparity < 1 || nparity >= 255 ||
- *end != '\0')
- return (NULL);
+ parity = strtol(p, &end, 10);
+ if (errno != 0 || *end != '\0' ||
+ parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
+ return (0);
+ }
+ }
+ } else if (strncmp(type, VDEV_TYPE_DRAID,
+ strlen(VDEV_TYPE_DRAID)) == 0) {
+ p = type + strlen(VDEV_TYPE_DRAID);
+
+ if (*p == '\0' || *p == ':') {
+ /* when unspecified default to single parity */
+ return (1);
+ } else if (*p == '0') {
+ /* no zero prefixes allowed */
+ return (0);
+ } else {
+ /* 0-3, allowed suffixes: '\0' or ':' */
+ char *end;
+ errno = 0;
+ parity = strtol(p, &end, 10);
+ if (errno != 0 ||
+ parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
+ (*end != '\0' && *end != ':')) {
+ return (0);
+ }
}
+ }
+
+ return ((int)parity);
+}
+
+/*
+ * Assign the minimum and maximum number of devices allowed for
+ * the specified type. On error NULL is returned, otherwise the
+ * type prefix is returned (raidz, mirror, etc).
+ */
+static const char *
+is_grouping(const char *type, int *mindev, int *maxdev)
+{
+ int nparity;
+ if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
+ nparity = get_parity(type);
+ if (nparity == 0)
+ return (NULL);
if (mindev != NULL)
*mindev = nparity + 1;
if (maxdev != NULL)
*maxdev = 255;
- return (VDEV_TYPE_RAIDZ);
+
+ if (strncmp(type, VDEV_TYPE_RAIDZ,
+ strlen(VDEV_TYPE_RAIDZ)) == 0) {
+ return (VDEV_TYPE_RAIDZ);
+ } else {
+ return (VDEV_TYPE_DRAID);
+ }
}
if (maxdev != NULL)
@@ -1168,6 +1280,163 @@ is_grouping(const char *type, int *mindev, int *maxdev)
}
/*
+ * Extract the configuration parameters encoded in the dRAID type and
+ * use them to generate a dRAID configuration. The expected format is:
+ *
+ * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
+ *
+ * The intent is to be able to generate a good configuration when no
+ * additional information is provided. The only mandatory component
+ * of the 'type' is the 'draid' prefix. If a value is not provided
+ * then reasonable defaults are used. The optional components may
+ * appear in any order but the d/s/c suffix is required.
+ *
+ * Valid inputs:
+ * - data: number of data devices per group (1-255)
+ * - parity: number of parity blocks per group (1-3)
+ * - spares: number of distributed spares (0-100)
+ * - children: total number of devices (1-255)
+ *
+ * Examples:
+ * - zpool create tank draid <devices...>
+ * - zpool create tank draid2:8d:51c:2s <devices...>
+ */
+static int
+draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
+{
+ uint64_t nparity = 1;
+ uint64_t nspares = 0;
+ uint64_t ndata = UINT64_MAX;
+ uint64_t ngroups = 1;
+ long value;
+
+ if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
+ return (EINVAL);
+
+ nparity = (uint64_t)get_parity(type);
+ if (nparity == 0)
+ return (EINVAL);
+
+ char *p = (char *)type;
+ while ((p = strchr(p, ':')) != NULL) {
+ char *end;
+
+ p = p + 1;
+ errno = 0;
+
+ if (!isdigit(p[0])) {
+ (void) fprintf(stderr, gettext("invalid dRAID "
+ "syntax; expected [:<number><c|d|s>] not '%s'\n"),
+ type);
+ return (EINVAL);
+ }
+
+ /* Expected non-zero value with c/d/s suffix */
+ value = strtol(p, &end, 10);
+ char suffix = tolower(*end);
+ if (errno != 0 ||
+ (suffix != 'c' && suffix != 'd' && suffix != 's')) {
+ (void) fprintf(stderr, gettext("invalid dRAID "
+ "syntax; expected [:<number><c|d|s>] not '%s'\n"),
+ type);
+ return (EINVAL);
+ }
+
+ if (suffix == 'c') {
+ if ((uint64_t)value != children) {
+ fprintf(stderr,
+ gettext("invalid number of dRAID children; "
+ "%llu required but %llu provided\n"),
+ (u_longlong_t)value,
+ (u_longlong_t)children);
+ return (EINVAL);
+ }
+ } else if (suffix == 'd') {
+ ndata = (uint64_t)value;
+ } else if (suffix == 's') {
+ nspares = (uint64_t)value;
+ } else {
+ verify(0); /* Unreachable */
+ }
+ }
+
+ /*
+ * When a specific number of data disks is not provided limit a
+ * redundancy group to 8 data disks. This value was selected to
+ * provide a reasonable tradeoff between capacity and performance.
+ */
+ if (ndata == UINT64_MAX) {
+ if (children > nspares + nparity) {
+ ndata = MIN(children - nspares - nparity, 8);
+ } else {
+ fprintf(stderr, gettext("request number of "
+ "distributed spares %llu and parity level %llu\n"
+ "leaves no disks available for data\n"),
+ (u_longlong_t)nspares, (u_longlong_t)nparity);
+ return (EINVAL);
+ }
+ }
+
+ /* Verify the maximum allowed group size is never exceeded. */
+ if (ndata == 0 || (ndata + nparity > children - nspares)) {
+ fprintf(stderr, gettext("requested number of dRAID data "
+ "disks per group %llu is too high,\nat most %llu disks "
+ "are available for data\n"), (u_longlong_t)ndata,
+ (u_longlong_t)(children - nspares - nparity));
+ return (EINVAL);
+ }
+
+ if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
+ fprintf(stderr,
+ gettext("invalid dRAID parity level %llu; must be "
+ "between 1 and %d\n"), (u_longlong_t)nparity,
+ VDEV_DRAID_MAXPARITY);
+ return (EINVAL);
+ }
+
+ /*
+ * Verify the requested number of spares can be satisfied.
+ * An arbitrary limit of 100 distributed spares is applied.
+ */
+ if (nspares > 100 || nspares > (children - (ndata + nparity))) {
+ fprintf(stderr,
+ gettext("invalid number of dRAID spares %llu; additional "
+ "disks would be required\n"), (u_longlong_t)nspares);
+ return (EINVAL);
+ }
+
+ /* Verify the requested number of children is sufficient. */
+ if (children < (ndata + nparity + nspares)) {
+ fprintf(stderr, gettext("%llu disks were provided, but at "
+ "least %llu disks are required for this config\n"),
+ (u_longlong_t)children,
+ (u_longlong_t)(ndata + nparity + nspares));
+ return (EINVAL);
+ }
+
+ if (children > VDEV_DRAID_MAX_CHILDREN) {
+ fprintf(stderr, gettext("%llu disks were provided, but "
+ "dRAID only supports up to %u disks"),
+ (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
+ return (EINVAL);
+ }
+
+ /*
+ * Calculate the minimum number of groups required to fill a slice.
+ * This is the LCM of the stripe width (ndata + nparity) and the
+ * number of data drives (children - nspares).
+ */
+ while (ngroups * (ndata + nparity) % (children - nspares) != 0)
+ ngroups++;
+
+ /* Store the basic dRAID configuration. */
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
+
+ return (0);
+}
+
+/*
* Construct a syntactically valid vdev specification,
* and ensure that all devices and files exist and can be opened.
* Note: we don't bother freeing anything in the error paths
@@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
{
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
- const char *type;
- uint64_t is_log, is_special, is_dedup;
+ const char *type, *fulltype;
+ boolean_t is_log, is_special, is_dedup, is_spare;
boolean_t seen_logs;
top = NULL;
@@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv)
nspares = 0;
nlogs = 0;
nl2cache = 0;
- is_log = is_special = is_dedup = B_FALSE;
+ is_log = is_special = is_dedup = is_spare = B_FALSE;
seen_logs = B_FALSE;
nvroot = NULL;
while (argc > 0) {
+ fulltype = argv[0];
nv = NULL;
/*
- * If it's a mirror or raidz, the subsequent arguments are
- * its leaves -- until we encounter the next mirror or raidz.
+ * If it's a mirror, raidz, or draid the subsequent arguments
+ * are its leaves -- until we encounter the next mirror,
+ * raidz or draid.
*/
- if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
+ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
nvlist_t **child = NULL;
int c, children = 0;
@@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
"specified only once\n"));
goto spec_out;
}
+ is_spare = B_TRUE;
is_log = is_special = is_dedup = B_FALSE;
}
@@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
}
seen_logs = B_TRUE;
is_log = B_TRUE;
- is_special = B_FALSE;
- is_dedup = B_FALSE;
+ is_special = is_dedup = is_spare = B_FALSE;
argc--;
argv++;
/*
@@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
is_special = B_TRUE;
- is_log = B_FALSE;
- is_dedup = B_FALSE;
+ is_log = is_dedup = is_spare = B_FALSE;
argc--;
argv++;
continue;
@@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
is_dedup = B_TRUE;
- is_log = B_FALSE;
- is_special = B_FALSE;
+ is_log = is_special = is_spare = B_FALSE;
argc--;
argv++;
continue;
@@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
"specified only once\n"));
goto spec_out;
}
- is_log = is_special = is_dedup = B_FALSE;
+ is_log = is_special = B_FALSE;
+ is_dedup = is_spare = B_FALSE;
}
if (is_log || is_special || is_dedup) {
@@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
for (c = 1; c < argc; c++) {
if (is_grouping(argv[c], NULL, NULL) != NULL)
break;
+
children++;
child = realloc(child,
children * sizeof (nvlist_t *));
if (child == NULL)
zpool_no_memory();
if ((nv = make_leaf_vdev(props, argv[c],
- B_FALSE)) == NULL) {
+ !(is_log || is_special || is_dedup ||
+ is_spare))) == NULL) {
for (c = 0; c < children - 1; c++)
nvlist_free(child[c]);
free(child);
@@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv)
type) == 0);
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
- if (is_log)
+ if (is_log) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
VDEV_ALLOC_BIAS_LOG) == 0);
+ }
if (is_special) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
@@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
ZPOOL_CONFIG_NPARITY,
mindev - 1) == 0);
}
+ if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
+ if (draid_config_by_type(nv,
+ fulltype, children) != 0) {
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ goto spec_out;
+ }
+ }
verify(nvlist_add_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, child,
children) == 0);
@@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv)
* We have a device. Pass off to make_leaf_vdev() to
* construct the appropriate nvlist describing the vdev.
*/
- if ((nv = make_leaf_vdev(props, argv[0],
- is_log)) == NULL)
+ if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
+ is_special || is_dedup || is_spare))) == NULL)
goto spec_out;
- if (is_log)
+ verify(nvlist_add_uint64(nv,
+ ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+ if (is_log) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_LOG) == 0);
nlogs++;
+ }
+
if (is_special) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
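The ngroups loop in draid_config_by_type() above finds the least common multiple of the stripe width and the non-spare disk count by brute force. Applied to the `draid2:8d:68c:2s` pool from the commit message, it takes 33 groups (330 sectors) to tile the 66 non-spare disks evenly. A standalone check:

```c
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* draid2:8d:68c:2s from the commit message example */
	uint64_t ndata = 8, nparity = 2, children = 68, nspares = 2;
	uint64_t ngroups = 1;

	/* Smallest ngroups where whole groups exactly fill a slice. */
	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
		ngroups++;

	/* Prints 33: 33 * 10 = 330 = 5 full rows of 66 disks. */
	printf("ngroups=%llu\n", (unsigned long long)ngroups);
	return (0);
}
```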
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 31205a5bf..1c4da20e4 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -104,6 +104,7 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
+#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
@@ -167,8 +168,11 @@ typedef struct ztest_shared_opts {
size_t zo_vdev_size;
int zo_ashift;
int zo_mirrors;
- int zo_raidz;
- int zo_raidz_parity;
+ int zo_raid_children;
+ int zo_raid_parity;
+ char zo_raid_type[8];
+ int zo_draid_data;
+ int zo_draid_spares;
int zo_datasets;
int zo_threads;
uint64_t zo_passtime;
@@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_vdevs = 5,
.zo_ashift = SPA_MINBLOCKSHIFT,
.zo_mirrors = 2,
- .zo_raidz = 4,
- .zo_raidz_parity = 1,
+ .zo_raid_children = 4,
+ .zo_raid_parity = 1,
+ .zo_raid_type = VDEV_TYPE_RAIDZ,
.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
+ .zo_draid_data = 4, /* data drives */
+ .zo_draid_spares = 1, /* distributed spares */
.zo_datasets = 7,
.zo_threads = 23,
.zo_passtime = 60, /* 60 seconds */
@@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds;
#define BT_MAGIC 0x123456789abcdefULL
#define MAXFAULTS(zs) \
- (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
+ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)
enum ztest_io_type {
ZTEST_IO_WRITE_TAG,
@@ -689,8 +696,11 @@ usage(boolean_t requested)
"\t[-s size_of_each_vdev (default: %s)]\n"
"\t[-a alignment_shift (default: %d)] use 0 for random\n"
"\t[-m mirror_copies (default: %d)]\n"
- "\t[-r raidz_disks (default: %d)]\n"
- "\t[-R raidz_parity (default: %d)]\n"
+ "\t[-r raidz_disks / draid_disks (default: %d)]\n"
+ "\t[-R raid_parity (default: %d)]\n"
+ "\t[-K raid_kind (default: random)] raidz|draid|random\n"
+ "\t[-D draid_data (default: %d)] in config\n"
+ "\t[-S draid_spares (default: %d)]\n"
"\t[-d datasets (default: %d)]\n"
"\t[-t threads (default: %d)]\n"
"\t[-g gang_block_threshold (default: %s)]\n"
@@ -716,8 +726,10 @@ usage(boolean_t requested)
nice_vdev_size, /* -s */
zo->zo_ashift, /* -a */
zo->zo_mirrors, /* -m */
- zo->zo_raidz, /* -r */
- zo->zo_raidz_parity, /* -R */
+ zo->zo_raid_children, /* -r */
+ zo->zo_raid_parity, /* -R */
+ zo->zo_draid_data, /* -D */
+ zo->zo_draid_spares, /* -S */
zo->zo_datasets, /* -d */
zo->zo_threads, /* -t */
nice_force_ganging, /* -g */
@@ -731,6 +743,21 @@ usage(boolean_t requested)
exit(requested ? 0 : 1);
}
+static uint64_t
+ztest_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT3S(ztest_fd_rand, >=, 0);
+
+ if (range == 0)
+ return (0);
+
+ if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
+ fatal(1, "short read from /dev/urandom");
+
+ return (r % range);
+}
static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
@@ -780,11 +807,12 @@ process_options(int argc, char **argv)
int opt;
uint64_t value;
char altdir[MAXNAMELEN] = { 0 };
+ char raid_kind[8] = { "random" };
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
- "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
+ "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
value = 0;
switch (opt) {
case 'v':
@@ -793,6 +821,8 @@ process_options(int argc, char **argv)
case 'm':
case 'r':
case 'R':
+ case 'D':
+ case 'S':
case 'd':
case 't':
case 'g':
@@ -817,10 +847,19 @@ process_options(int argc, char **argv)
zo->zo_mirrors = value;
break;
case 'r':
- zo->zo_raidz = MAX(1, value);
+ zo->zo_raid_children = MAX(1, value);
break;
case 'R':
- zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
+ zo->zo_raid_parity = MIN(MAX(value, 1), 3);
+ break;
+ case 'K':
+ (void) strlcpy(raid_kind, optarg, sizeof (raid_kind));
+ break;
+ case 'D':
+ zo->zo_draid_data = MAX(1, value);
+ break;
+ case 'S':
+ zo->zo_draid_spares = MAX(1, value);
break;
case 'd':
zo->zo_datasets = MAX(1, value);
@@ -895,7 +934,54 @@ process_options(int argc, char **argv)
}
}
- zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
+ /* When raid choice is 'random' add a draid pool 50% of the time */
+ if (strcmp(raid_kind, "random") == 0) {
+ (void) strlcpy(raid_kind, (ztest_random(2) == 0) ?
+ "draid" : "raidz", sizeof (raid_kind));
+
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("choosing RAID type '%s'\n", raid_kind);
+ }
+
+ if (strcmp(raid_kind, "draid") == 0) {
+ uint64_t min_devsize;
+
+ /* With fewer disks use 256M, otherwise 128M is OK */
+ min_devsize = (ztest_opts.zo_raid_children < 16) ?
+ (256ULL << 20) : (128ULL << 20);
+
+ /* No top-level mirrors with dRAID for now */
+ zo->zo_mirrors = 0;
+
+ /* Use more appropriate defaults for dRAID */
+ if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
+ zo->zo_vdevs = 1;
+ if (zo->zo_raid_children ==
+ ztest_opts_defaults.zo_raid_children)
+ zo->zo_raid_children = 16;
+ if (zo->zo_ashift < 12)
+ zo->zo_ashift = 12;
+ if (zo->zo_vdev_size < min_devsize)
+ zo->zo_vdev_size = min_devsize;
+
+ if (zo->zo_draid_data + zo->zo_raid_parity >
+ zo->zo_raid_children - zo->zo_draid_spares) {
+ (void) fprintf(stderr, "error: too few draid "
+ "children (%d) for stripe width (%d)\n",
+ zo->zo_raid_children,
+ zo->zo_draid_data + zo->zo_raid_parity);
+ usage(B_FALSE);
+ }
+
+ (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
+ sizeof (zo->zo_raid_type));
+
+ } else /* using raidz */ {
+ ASSERT0(strcmp(raid_kind, "raidz"));
+
+ zo->zo_raid_parity = MIN(zo->zo_raid_parity,
+ zo->zo_raid_children - 1);
+ }
zo->zo_vdevtime =
(zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
@@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs)
(void) kill(getpid(), SIGKILL);
}
-static uint64_t
-ztest_random(uint64_t range)
-{
- uint64_t r;
-
- ASSERT3S(ztest_fd_rand, >=, 0);
-
- if (range == 0)
- return (0);
-
- if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
- fatal(1, "short read from /dev/urandom");
-
- return (r % range);
-}
-
/* ARGSUSED */
static void
ztest_record_enospc(const char *s)
@@ -997,12 +1067,27 @@ ztest_get_ashift(void)
return (ztest_opts.zo_ashift);
}
+static boolean_t
+ztest_is_draid_spare(const char *name)
+{
+ uint64_t spare_id = 0, parity = 0, vdev_id = 0;
+
+ if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
+ (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
+ (u_longlong_t *)&spare_id) == 3) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
static nvlist_t *
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
{
char *pathbuf;
uint64_t vdev;
nvlist_t *file;
+ boolean_t draid_spare = B_FALSE;
pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
@@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
ztest_dev_template, ztest_opts.zo_dir,
pool == NULL ? ztest_opts.zo_pool : pool, vdev);
}
+ } else {
+ draid_spare = ztest_is_draid_spare(path);
}
- if (size != 0) {
+ if (size != 0 && !draid_spare) {
int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
fatal(1, "can't open %s", path);
@@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
(void) close(fd);
}
- VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
- VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
- VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+ VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0));
+ VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE,
+ draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE));
+ VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path));
+ VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift));
umem_free(pathbuf, MAXPATHLEN);
return (file);
}
static nvlist_t *
-make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+make_vdev_raid(char *path, char *aux, char *pool, size_t size,
uint64_t ashift, int r)
{
- nvlist_t *raidz, **child;
+ nvlist_t *raid, **child;
int c;
if (r < 2)
@@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
for (c = 0; c < r; c++)
child[c] = make_vdev_file(path, aux, pool, size, ashift);
- VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_RAIDZ) == 0);
- VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
- ztest_opts.zo_raidz_parity) == 0);
- VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
- child, r) == 0);
+ VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0));
+ VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
+ ztest_opts.zo_raid_type));
+ VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
+ ztest_opts.zo_raid_parity));
+ VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
+ child, r));
+
+ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
+ uint64_t ndata = ztest_opts.zo_draid_data;
+ uint64_t nparity = ztest_opts.zo_raid_parity;
+ uint64_t nspares = ztest_opts.zo_draid_spares;
+ uint64_t children = ztest_opts.zo_raid_children;
+ uint64_t ngroups = 1;
+
+ /*
+ * Calculate the minimum number of groups required to fill a
+ * slice. This is the LCM of the stripe width (data + parity)
+ * and the number of data drives (children - spares).
+ */
+ while (ngroups * (ndata + nparity) % (children - nspares) != 0)
+ ngroups++;
+
+ /* Store the basic dRAID configuration. */
+ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
+ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
+ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
+ }
for (c = 0; c < r; c++)
nvlist_free(child[c]);
umem_free(child, r * sizeof (nvlist_t *));
- return (raidz);
+ return (raid);
}
static nvlist_t *
@@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
int c;
if (m < 1)
- return (make_vdev_raidz(path, aux, pool, size, ashift, r));
+ return (make_vdev_raid(path, aux, pool, size, ashift, r));
child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < m; c++)
- child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
+ child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);
VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
@@ -2809,6 +2918,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
if (ztest_opts.zo_mmp_test)
return;
+	/* dRAID was added after feature flags; skip the upgrade test. */
+ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0)
+ return;
+
mutex_enter(&ztest_vdev_lock);
name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
@@ -2818,13 +2931,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
(void) spa_destroy(name);
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
- NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+ NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1);
/*
* If we're configuring a RAIDZ device then make sure that the
* initial version is capable of supporting that feature.
*/
- switch (ztest_opts.zo_raidz_parity) {
+ switch (ztest_opts.zo_raid_parity) {
case 0:
case 1:
initial_version = SPA_VERSION_INITIAL;
@@ -2970,7 +3083,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
return;
mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
+ ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -3024,7 +3138,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
*/
nvroot = make_vdev_root(NULL, NULL, NULL,
ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
- "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+ "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
+ 1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
@@ -3078,14 +3193,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
return;
}
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
+ ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
spa_config_exit(spa, SCL_VDEV, FTAG);
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+ class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
@@ -3134,7 +3250,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
char *aux;
char *path;
uint64_t guid = 0;
- int error;
+ int error, ignore_err = 0;
if (ztest_opts.zo_mmp_test)
return;
@@ -3157,7 +3273,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
/*
* Pick a random device to remove.
*/
- guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
+ vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+
+	/* dRAID spares cannot be removed; try anyway to see ENOTSUP */
+ if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL)
+ ignore_err = ENOTSUP;
+
+ guid = svd->vdev_guid;
} else {
/*
* Find an unused device we can add.
@@ -3214,7 +3336,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
case ZFS_ERR_DISCARDING_CHECKPOINT:
break;
default:
- fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+ if (error != ignore_err)
+ fatal(0, "spa_vdev_remove(%llu) = %d", guid,
+ error);
}
}
@@ -3243,7 +3367,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
mutex_enter(&ztest_vdev_lock);
/* ensure we have a usable config; mirrors of raidz aren't supported */
- if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+ if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) {
mutex_exit(&ztest_vdev_lock);
return;
}
@@ -3343,6 +3467,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
int replacing;
int oldvd_has_siblings = B_FALSE;
int newvd_is_spare = B_FALSE;
+ int newvd_is_dspare = B_FALSE;
int oldvd_is_log;
int error, expected_error;
@@ -3353,7 +3478,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+ leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
@@ -3393,14 +3518,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (zs->zs_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
- oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
+ oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
}
/* pick a child out of the raidz group */
- if (ztest_opts.zo_raidz > 1) {
- ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
- ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
- oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
+ if (ztest_opts.zo_raid_children > 1) {
+		if (strcmp(oldvd->vdev_ops->vdev_op_type,
+		    VDEV_TYPE_RAIDZ) == 0)
+			ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		else
+			ASSERT(oldvd->vdev_ops == &vdev_draid_ops);
+ ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children);
+ oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
}
/*
@@ -3447,6 +3575,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (sav->sav_count != 0 && ztest_random(3) == 0) {
newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
newvd_is_spare = B_TRUE;
+
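+		/*
+		 * A distributed spare may only replace a child of its
+		 * own dRAID vdev (see the ENOTSUP check below).
+		 */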
+ if (newvd->vdev_ops == &vdev_draid_spare_ops)
+ newvd_is_dspare = B_TRUE;
+
(void) strcpy(newpath, newvd->vdev_path);
} else {
(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
@@ -3480,6 +3612,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* If newvd is already part of the pool, it should fail with EBUSY.
*
* If newvd is too small, it should fail with EOVERFLOW.
+ *
+	 * If newvd is a distributed spare and it's being attached to a
+	 * dRAID which is not its parent, it should fail with ENOTSUP.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
@@ -3492,10 +3627,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
expected_error = replacing ? 0 : EBUSY;
else if (vdev_lookup_by_path(rvd, newpath) != NULL)
expected_error = EBUSY;
- else if (newsize < oldsize)
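+	/*
+	 * The size check does not apply to distributed spares, whose
+	 * size is dictated by the dRAID vdev rather than a backing file.
+	 */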
+ else if (!newvd_is_dspare && newsize < oldsize)
expected_error = EOVERFLOW;
else if (ashift > oldvd->vdev_top->vdev_ashift)
expected_error = EDOM;
+ else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
+ expected_error = ENOTSUP;
else
expected_error = 0;
@@ -4880,13 +5017,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- VERIFY(0 == dmu_read(os, packobj, packoff,
+ VERIFY0(dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, bigobj, bigoff,
+ VERIFY0(dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
- ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
- ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+ ASSERT0(bcmp(packbuf, packcheck, packsize));
+ ASSERT0(bcmp(bigbuf, bigcheck, bigsize));
umem_free(packcheck, packsize);
umem_free(bigcheck, bigsize);
@@ -5761,7 +5898,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
}
maxfaults = MAXFAULTS(zs);
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+ leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
mirror_save = zs->zs_mirrors;
mutex_exit(&ztest_vdev_lock);
@@ -6011,7 +6148,7 @@ out:
/*
* By design ztest will never inject uncorrectable damage in to the pool.
* Issue a scrub, wait for it to complete, and verify there is never any
- * any persistent damage.
+ * persistent damage.
*
* Only after a full scrub has been completed is it safe to start injecting
* data corruption. See the comment in zfs_fault_inject().
@@ -7347,7 +7484,7 @@ ztest_init(ztest_shared_t *zs)
zs->zs_splits = 0;
zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+ NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
props = make_random_props();
/*
@@ -7683,10 +7820,12 @@ main(int argc, char **argv)
if (ztest_opts.zo_verbose >= 1) {
(void) printf("%llu vdevs, %d datasets, %d threads,"
- " %llu seconds...\n",
+ "%d %s disks, %llu seconds...\n\n",
(u_longlong_t)ztest_opts.zo_vdevs,
ztest_opts.zo_datasets,
ztest_opts.zo_threads,
+ ztest_opts.zo_raid_children,
+ ztest_opts.zo_raid_type,
(u_longlong_t)ztest_opts.zo_time);
}