author     Brian Behlendorf <[email protected]>   2020-11-13 13:51:51 -0800
committer  GitHub <[email protected]>             2020-11-13 13:51:51 -0800
commit     b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree       6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /cmd
parent     a724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
As with `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool, and reasonable default values are chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
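For instance, a double-parity dRAID vdev could be created from eleven
disks with all other values defaulted (device names illustrative):

zpool create tank draid2 sda sdb sdc sdd sde sdf sdg sdh sdi sdj sdk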
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon-separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[<parity>] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
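For example, the 68-disk layout shown in the `zpool status` output
below could be requested explicitly (device arguments elided):

zpool create slag7 draid2:8d:68c:2s <68 disks...>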
Abbreviated example `zpool status` output for a 68-disk dRAID pool
with two distributed spares and special allocation classes.
```
  pool: slag7
 state: ONLINE
config:

	NAME                  STATE     READ WRITE CKSUM
	slag7                 ONLINE       0     0     0
	  draid2:8d:68c:2s-0  ONLINE       0     0     0
	    L0                ONLINE       0     0     0
	    L1                ONLINE       0     0     0
	    ...
	    U25               ONLINE       0     0     0
	    U26               ONLINE       0     0     0
	    spare-53          ONLINE       0     0     0
	      U27             ONLINE       0     0     0
	      draid2-0-0      ONLINE       0     0     0
	    U28               ONLINE       0     0     0
	    U29               ONLINE       0     0     0
	    ...
	    U42               ONLINE       0     0     0
	    U43               ONLINE       0     0     0
	special
	  mirror-1            ONLINE       0     0     0
	    L5                ONLINE       0     0     0
	    U5                ONLINE       0     0     0
	  mirror-2            ONLINE       0     0     0
	    L6                ONLINE       0     0     0
	    U6                ONLINE       0     0     0
	spares
	  draid2-0-0          INUSE     currently in use
	  draid2-0-1          AVAIL
```
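The distributed spare names shown above are structured: as in the
`ztest_is_draid_spare()` helper added by this patch, the form
`draid<parity>-<vdev>-<spare>` encodes the parity level, the id of the
parent top-level dRAID vdev, and the spare id. A minimal standalone
sketch of that naming convention (the `draid` prefix is hardcoded here;
the real helper uses the `VDEV_TYPE_DRAID` constant):

```
#include <stdio.h>

/*
 * Parse a dRAID distributed spare name of the form
 * draid<parity>-<vdev>-<spare>, mirroring the sscanf() convention
 * used by ztest_is_draid_spare() in this patch.
 */
static int
is_draid_spare(const char *name, unsigned long long *parity,
    unsigned long long *vdev_id, unsigned long long *spare_id)
{
	return (sscanf(name, "draid%llu-%llu-%llu",
	    parity, vdev_id, spare_id) == 3);
}

int
main(void)
{
	unsigned long long parity, vdev_id, spare_id;

	/* "draid2-0-1": double parity, first dRAID vdev, second spare. */
	if (is_draid_spare("draid2-0-1", &parity, &vdev_id, &spare_id)) {
		printf("parity=%llu vdev=%llu spare=%llu\n",
		    parity, vdev_id, spare_id);
	}
	return (0);
}
```

So `draid2-0-1` above is spare id 1 belonging to top-level vdev 0 of a
double-parity dRAID.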
When adding test coverage for the new dRAID vdev type, the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations; an example
invocation follows the option list.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
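For example, a hypothetical invocation exercising a 16-disk dRAID2
layout with eight data disks per group and one distributed spare
(all values illustrative):

ztest -K draid -r 16 -D 8 -S 1 -R 2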
The zpool_create, zpool_import, redundancy, replacement, and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/raidz_test/raidz_bench.c    25
-rw-r--r--  cmd/raidz_test/raidz_test.c    330
-rw-r--r--  cmd/raidz_test/raidz_test.h      9
-rw-r--r--  cmd/zdb/zdb.c                   10
-rw-r--r--  cmd/zed/agents/zfs_mod.c        10
-rw-r--r--  cmd/zed/agents/zfs_retire.c     11
-rw-r--r--  cmd/zfs/zfs_main.c             130
-rw-r--r--  cmd/zpool/zpool_main.c           5
-rw-r--r--  cmd/zpool/zpool_vdev.c         393
-rw-r--r--  cmd/ztest/ztest.c              281
10 files changed, 1011 insertions, 193 deletions
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 8a2cec4ca..a3446c52c 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl) /* create suitable raidz_map */ ncols = rto_opts.rto_dcols + fn + 1; zio_bench.io_size = 1ULL << ds; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, fn+1); + + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + rto_opts.rto_ashift, ncols+1, ncols, + fn+1, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + } /* estimate iteration count */ iter_cnt = GEN_BENCH_MEMORY; @@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl) (1ULL << BENCH_ASHIFT)) continue; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, PARITY_PQR); + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + BENCH_ASHIFT, ncols+1, ncols, + PARITY_PQR, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + } /* estimate iteration count */ iter_cnt = (REC_BENCH_MEMORY); diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 66f36b0d5..4e2639f36 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force) (void) fprintf(stdout, DBLSEP "Running with options:\n" " (-a) zio ashift : %zu\n" " (-o) zio offset : 1 << %zu\n" + " (-e) expanded map : %s\n" + " (-r) reflow offset : %llx\n" " (-d) number of raidz data columns : %zu\n" " (-s) size of DATA : 1 << %zu\n" " (-S) sweep parameters : %s \n" " (-v) verbose : %s \n\n", - opts->rto_ashift, /* -a */ - ilog2(opts->rto_offset), /* -o */ - opts->rto_dcols, /* -d */ - ilog2(opts->rto_dsize), /* -s */ - opts->rto_sweep ? "yes" : "no", /* -S */ - verbose); /* -v */ + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)opts->rto_expand_offset, /* -r */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ } } @@ -104,6 +108,8 @@ static void usage(boolean_t requested) "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" + "\t[-e use expanded raidz map (default: %s)]\n" + "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %zu)]\n" "\t[-h (print help)]\n" "\t[-T test the test, see if failure would be detected]\n" @@ -114,6 +120,8 @@ static void usage(boolean_t requested) o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ + rto_opts.rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -d */ exit(requested ? 
0 : 1); @@ -128,7 +136,7 @@ static void process_options(int argc, char **argv) bcopy(&rto_opts_defaults, o, sizeof (*o)); - while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { value = 0; switch (opt) { @@ -136,6 +144,12 @@ static void process_options(int argc, char **argv) value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; + case 'e': + o->rto_expand = 1; + break; + case 'r': + o->rto_expand_offset = strtoull(optarg, NULL, 0); + break; case 'o': value = strtoull(optarg, NULL, 0); o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; @@ -179,25 +193,34 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) -#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) +#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) +#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) -#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) +#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) +#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) static int cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) { - int i, ret = 0; + int r, i, ret = 0; VERIFY(parity >= 1 && parity <= 3); - for (i = 0; i < parity; i++) { - if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) - != 0) { - ret++; - LOG_OPT(D_DEBUG, opts, - "\nParity block [%d] different!\n", i); + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t * const rr = rm->rm_row[r]; + raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; + for (i = 0; i < parity; i++) { + if (CODE_COL_SIZE(rrg, i) == 0) { + VERIFY0(CODE_COL_SIZE(rr, i)); + continue; + } + + if (abd_cmp(CODE_COL(rr, i), + CODE_COL(rrg, i)) != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } } } return (ret); @@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) static int cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) { - int i, ret = 0; - int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + int r, i, dcols, ret = 0; + + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + raidz_row_t *rrg = opts->rm_golden->rm_row[r]; + dcols = opts->rm_golden->rm_row[0]->rr_cols - + raidz_parity(opts->rm_golden); + for (i = 0; i < dcols; i++) { + if (DATA_COL_SIZE(rrg, i) == 0) { + VERIFY0(DATA_COL_SIZE(rr, i)); + continue; + } - for (i = 0; i < dcols; i++) { - if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) - != 0) { - ret++; + if (abd_cmp(DATA_COL(rrg, i), + DATA_COL(rr, i)) != 0) { + ret++; - LOG_OPT(D_DEBUG, opts, - "\nData block [%d] different!\n", i); + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } } } return (ret); @@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private) static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { - int i; - raidz_col_t *col; - - for (i = 0; i < cnt; i++) { - col = &rm->rm_col[tgts[i]]; - abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + for (int i = 0; i < cnt; i++) { + raidz_col_t *col = &rr->rr_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, + init_rand, NULL); + } } } @@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const 
int parity) VERIFY0(vdev_raidz_impl_set("original")); - opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, - opts->rto_ashift, total_ncols, parity); - rm_test = vdev_raidz_map_alloc(zio_test, - opts->rto_ashift, total_ncols, parity); + if (opts->rto_expand) { + opts->rm_golden = + vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, + opts->zio_golden->io_size, opts->zio_golden->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, + zio_test->io_size, zio_test->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + } VERIFY(opts->zio_golden); VERIFY(opts->rm_golden); @@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } +/* + * If reflow is not in progress, reflow_offset should be UINT64_MAX. + * For each row, if the row is entirely before reflow_offset, it will + * come from the new location. Otherwise this row will come from the + * old location. Therefore, rows that straddle the reflow_offset will + * come from the old location. + * + * NOTE: Until raidz expansion is implemented this function is only + * needed by raidz_test.c to the multi-row raid_map_t functionality. + */ +raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset) +{ + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, devidx, asize = 0, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and any part of this + * row has not been copied, then use the old location of + * this row. + */ + int row_phys_cols = physical_cols; + if (b + (logical_cols - nparity) > reflow_offset >> ashift) + row_phys_cols--; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter. 
This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_cols = cols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = child_id; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end, this for parity generation. + */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* + * "data column" (col excluding parity) + * Add an ASCII art diagram here + */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_get_offset(abd, off << ashift); + } + + asize += rr->rr_col[c].rc_size; + } + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. We therefore switch + * the parity every 1MB. + * + * ...at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices + * evenly, we won't see any benefit. Further, occasional writes + * that aren't a multiple of the LCM of the number of children + * and the minimum stripe width are sufficient to avoid pessimal + * behavior. Unfortunately, this decision created an implicit + * on-disk format requirement that we need to support for all + * eternity, but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row. 
+ */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + } + ASSERT3U(asize, ==, tot << ashift); + + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) (*zio)->io_abd = raidz_alloc(alloc_dsize); init_zio_abd(*zio); - rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, - total_ncols, parity); + if (opts->rto_expand) { + rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, + (*zio)->io_size, (*zio)->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + } VERIFY(rm); /* Make sure code columns are destroyed */ @@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) if (fn < RAIDZ_REC_PQ) { /* can reconstruct 1 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ @@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else if (fn < RAIDZ_REC_PQR) { /* can reconstruct 2 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else { /* can reconstruct 3 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { - if (x2 >= - rm->rm_cols - raidz_parity(rm)) + if (x2 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -700,6 +937,8 @@ run_sweep(void) opts->rto_dcols = dcols_v[d]; opts->rto_offset = (1 << ashift_v[a]) * rand(); opts->rto_dsize = size_v[s]; + opts->rto_expand = rto_opts.rto_expand; + opts->rto_expand_offset = rto_opts.rto_expand_offset; opts->rto_v = 0; /* be quiet */ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, @@ -732,6 +971,7 @@ exit: return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); } + int main(int argc, char **argv) { diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index 09c825ae4..0f7f4cee3 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = { typedef struct raidz_test_opts { size_t rto_ashift; - size_t rto_offset; + uint64_t rto_offset; size_t rto_dcols; size_t rto_dsize; size_t rto_v; size_t rto_sweep; size_t rto_sweep_timeout; size_t rto_benchmark; + size_t rto_expand; + uint64_t rto_expand_offset; size_t rto_sanity; size_t rto_gdb; @@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = { .rto_v = 0, .rto_sweep = 0, .rto_benchmark = 0, + .rto_expand = 0, + .rto_expand_offset = -1ULL, .rto_sanity = 0, .rto_gdb = 0, .rto_should_stop = B_FALSE @@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); +struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + #endif /* RAIDZ_TEST_H */ diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dbf09a652..d4a37dee0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { @@ -5203,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - abd_free(zio->io_abd); - mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); @@ -5231,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio) blkbuf); } mutex_exit(&spa->spa_scrub_lock); + + abd_free(zio->io_abd); } static int diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 8190beb0c..4a58e1f1d 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + /* + * Prefer sequential resilvering when supported (mirrors and dRAID), + * otherwise fallback to a traditional healing resilver. + */ + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); + if (ret != 0) { + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, + B_TRUE, B_FALSE); + } zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index ba8a6de3a..89bb84e48 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) * replace it. 
*/ for (s = 0; s < nspares; s++) { - char *spare_name; + boolean_t rebuild = B_FALSE; + char *spare_name, *type; if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; + /* prefer sequential resilvering for distributed spares */ + if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, + &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + rebuild = B_TRUE; + /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) (void) nvlist_add_uint64(spares[s], @@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) dev_name, basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE, B_FALSE) == 0) { + replacement, B_TRUE, rebuild) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * Attempt to substitute a hot spare. */ (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f609a4e70..340a7db96 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -893,6 +893,107 @@ usage: } /* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + nvlist_t *tree, **vdevs; + uint_t nvdevs; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (ZVOL_DEFAULT_BLOCKSIZE); + } + + for (int i = 0; i < nvdevs; i++) { + nvlist_t *nv = vdevs[i]; + uint64_t ashift, ndata, nparity; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, + &ndata) == 0) { + /* dRAID minimum allocation width */ + asize = MAX(asize, ndata * (1ULL << ashift)); + } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + /* raidz minimum allocation width */ + if (nparity == 1) + asize = MAX(asize, 2 * (1ULL << ashift)); + else + asize = MAX(asize, 4 * (1ULL << ashift)); + } else { + /* mirror or (non-redundant) leaf vdev */ + asize = MAX(asize, 1ULL << ashift); + } + } + + /* + * Calculate the target volblocksize such that more than half + * of the asize is used. The following table is for 4k sectors. + * + * n asize blksz used | n asize blksz used + * -------------------------+--------------------------------- + * 1 4,096 8,192 100% | 9 36,864 32,768 88% + * 2 8,192 8,192 100% | 10 40,960 32,768 80% + * 3 12,288 8,192 66% | 11 45,056 32,768 72% + * 4 16,384 16,384 100% | 12 49,152 32,768 66% + * 5 20,480 16,384 80% | 13 53,248 32,768 61% + * 6 24,576 16,384 66% | 14 57,344 32,768 57% + * 7 28,672 16,384 57% | 15 61,440 32,768 53% + * 8 32,768 32,768 100% | 16 65,536 65,636 100% + * + * This is primarily a concern for dRAID which always allocates + * a full stripe width. For dRAID the default stripe width is + * n=8 in which case the volblocksize is set to 32k. Ignoring + * compression there are no unused sectors. This same reasoning + * applies to raidz[2,3] so target 4 sectors to minimize waste. 
+ */ + uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + while (tgt_volblocksize * 2 <= asize) + tgt_volblocksize *= 2; + + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { + + /* Issue a warning when a non-optimal size is requested. */ + if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is less than the default " + "minimum block size (%llu).\nTo reduce wasted " + "space a volblocksize of %llu is recommended.\n"), + (u_longlong_t)volblocksize, + (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, + (u_longlong_t)tgt_volblocksize); + } else if (volblocksize < tgt_volblocksize) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is much less than the " + "minimum allocation\nunit (%llu), which wastes " + "at least %llu%% of space. To reduce wasted " + "space,\nuse a larger volblocksize (%llu is " + "recommended), fewer dRAID data disks\n" + "per group, or smaller sector size (ashift).\n"), + (u_longlong_t)volblocksize, (u_longlong_t)asize, + (u_longlong_t)((100 * (asize - volblocksize)) / + asize), (u_longlong_t)tgt_volblocksize); + } + } else { + volblocksize = tgt_volblocksize; + fnvlist_add_uint64(props, prop, volblocksize); + } + + return (volblocksize); +} + +/* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * @@ -932,6 +1033,7 @@ zfs_do_create(int argc, char **argv) int ret = 1; nvlist_t *props; uint64_t intval; + char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); @@ -1018,7 +1120,7 @@ zfs_do_create(int argc, char **argv) goto badusage; } - if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { + if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; @@ -1040,18 +1142,24 @@ zfs_do_create(int argc, char **argv) } } - /* - * if volsize is not a multiple of volblocksize, round it up to the - * nearest multiple of the volblocksize - */ if (type == ZFS_TYPE_VOLUME) { - uint64_t volblocksize; + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + uint64_t volblocksize = default_volblocksize(zpool_handle, + real_props); - if (nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize) != 0) - volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && + nvlist_lookup_string(props, prop, &strval) != 0) { + if (asprintf(&strval, "%llu", + (u_longlong_t)volblocksize) == -1) + nomem(); + nvlist_add_string(props, prop, strval); + free(strval); + } + /* + * If volsize is not a multiple of volblocksize, round it + * up to the nearest multiple of the volblocksize. + */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); @@ -1064,11 +1172,9 @@ zfs_do_create(int argc, char **argv) } } - if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; - char *strval; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 83a9b5a5a..524cff335 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2294,7 +2294,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } - /* Display vdev initialization and trim status for leaves */ + /* Display vdev initialization and trim status for leaves. 
*/ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); @@ -9849,7 +9849,8 @@ vdev_any_spare_replacing(nvlist_t *nv) (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || - strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || + strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 9aa09b18c..c86081a81 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -86,9 +86,6 @@ boolean_t error_seen; boolean_t is_force; - - - /*PRINTFLIKE1*/ void vdev_error(const char *fmt, ...) @@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; + if (zpool_is_draid_spare(path)) + return (B_TRUE); + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); @@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for <zfs_vdev_paths>/xxx + * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; @@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (zpool_is_draid_spare(arg)) { + if (!is_primary) { + (void) fprintf(stderr, + gettext("cannot open '%s': dRAID spares can only " + "be used to replace primary vdevs\n"), arg); + return (NULL); + } + + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. + */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -432,11 +443,16 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +/* + * N.B. For the purposes of comparing replication levels dRAID can be + * considered functionally equivilant to raidz. 
+ */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { - if (strcmp(a->zprl_type, "raidz") == 0 && + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; @@ -446,6 +462,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, } /* + * Comparison for determining if dRAID and raidz where passed in either order. + */ +static boolean_t +is_raidz_draid(replication_level_t *a, replication_level_t *b) +{ + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && + (strcmp(b->zprl_type, "raidz") == 0 || + strcmp(b->zprl_type, "draid") == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. @@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } + } else if (is_raidz_draid(&lastrep, &rep)) { + /* + * Accepted raidz and draid when they can + * handle the same number of disk failures. + */ + if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s and %s vdevs " + "with different " + "redundancy, %llu vs. " + "%llu are present\n"), + lastrep.zprl_type, + rep.zprl_type, + lastrep.zprl_parity, + rep.zprl_parity); + else + return (NULL); + } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) @@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, return (anyinuse); } -static const char * -is_grouping(const char *type, int *mindev, int *maxdev) +/* + * Returns the parity level extracted from a raidz or draid type. + * If the parity cannot be determined zero is returned. 
+ */ +static int +get_parity(const char *type) { - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; + long parity = 0; + const char *p; + + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { - nparity = 1; + /* when unspecified default to single parity */ + return (1); } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ + /* no zero prefixes allowed */ + return (0); } else { + /* 0-3, no suffixes allowed */ + char *end; errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { + return (0); + } + } + } else if (strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0) { + p = type + strlen(VDEV_TYPE_DRAID); + + if (*p == '\0' || *p == ':') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, allowed suffixes: '\0' or ':' */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || + parity < 1 || parity > VDEV_DRAID_MAXPARITY || + (*end != '\0' && *end != ':')) { + return (0); + } } + } + + return ((int)parity); +} + +/* + * Assign the minimum and maximum number of devices allowed for + * the specified type. On error NULL is returned, otherwise the + * type prefix is returned (raidz, mirror, etc). + */ +static const char * +is_grouping(const char *type, int *mindev, int *maxdev) +{ + int nparity; + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + nparity = get_parity(type); + if (nparity == 0) + return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + + if (strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0) { + return (VDEV_TYPE_RAIDZ); + } else { + return (VDEV_TYPE_DRAID); + } } if (maxdev != NULL) @@ -1168,6 +1280,163 @@ is_grouping(const char *type, int *mindev, int *maxdev) } /* + * Extract the configuration parameters encoded in the dRAID type and + * use them to generate a dRAID configuration. The expected format is: + * + * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] + * + * The intent is to be able to generate a good configuration when no + * additional information is provided. The only mandatory component + * of the 'type' is the 'draid' prefix. If a value is not provided + * then reasonable defaults are used. The optional components may + * appear in any order but the d/s/c suffix is required. 
+ * + * Valid inputs: + * - data: number of data devices per group (1-255) + * - parity: number of parity blocks per group (1-3) + * - spares: number of distributed spare (0-100) + * - children: total number of devices (1-255) + * + * Examples: + * - zpool create tank draid <devices...> + * - zpool create tank draid2:8d:51c:2s <devices...> + */ +static int +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +{ + uint64_t nparity = 1; + uint64_t nspares = 0; + uint64_t ndata = UINT64_MAX; + uint64_t ngroups = 1; + long value; + + if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0) + return (EINVAL); + + char *p = (char *)type; + while ((p = strchr(p, ':')) != NULL) { + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d/s suffix */ + value = strtol(p, &end, 10); + char suffix = tolower(*end); + if (errno != 0 || + (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + if (suffix == 'c') { + if ((uint64_t)value != children) { + fprintf(stderr, + gettext("invalid number of dRAID children; " + "%llu required but %llu provided\n"), + (u_longlong_t)value, + (u_longlong_t)children); + return (EINVAL); + } + } else if (suffix == 'd') { + ndata = (uint64_t)value; + } else if (suffix == 's') { + nspares = (uint64_t)value; + } else { + verify(0); /* Unreachable */ + } + } + + /* + * When a specific number of data disks is not provided limit a + * redundancy group to 8 data disks. This value was selected to + * provide a reasonable tradeoff between capacity and performance. + */ + if (ndata == UINT64_MAX) { + if (children > nspares + nparity) { + ndata = MIN(children - nspares - nparity, 8); + } else { + fprintf(stderr, gettext("request number of " + "distributed spares %llu and parity level %llu\n" + "leaves no disks available for data\n"), + (u_longlong_t)nspares, (u_longlong_t)nparity); + return (EINVAL); + } + } + + /* Verify the maximum allowed group size is never exceeded. */ + if (ndata == 0 || (ndata + nparity > children - nspares)) { + fprintf(stderr, gettext("requested number of dRAID data " + "disks per group %llu is too high,\nat most %llu disks " + "are available for data\n"), (u_longlong_t)ndata, + (u_longlong_t)(children - nspares - nparity)); + return (EINVAL); + } + + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); + return (EINVAL); + } + + /* + * Verify the requested number of spares can be satisfied. + * An arbitrary limit of 100 distributed spares is applied. + */ + if (nspares > 100 || nspares > (children - (ndata + nparity))) { + fprintf(stderr, + gettext("invalid number of dRAID spares %llu; additional " + "disks would be required\n"), (u_longlong_t)nspares); + return (EINVAL); + } + + /* Verify the requested number children is sufficient. 
*/ + if (children < (ndata + nparity + nspares)) { + fprintf(stderr, gettext("%llu disks were provided, but at " + "least %llu disks are required for this config\n"), + (u_longlong_t)children, + (u_longlong_t)(ndata + nparity + nspares)); + } + + if (children > VDEV_DRAID_MAX_CHILDREN) { + fprintf(stderr, gettext("%llu disks were provided, but " + "dRAID only supports up to %u disks"), + (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); + } + + /* + * Calculate the minimum number of groups required to fill a slice. + * This is the LCM of the stripe width (ndata + nparity) and the + * number of data drives (children - nspares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + + return (0); +} + +/* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths @@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; + const char *type, *fulltype; + boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; @@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { + fulltype = argv[0]; nv = NULL; /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. + * If it's a mirror, raidz, or draid the subsequent arguments + * are its leaves -- until we encounter the next mirror, + * raidz or draid. 
*/ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } + is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } @@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; + is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* @@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; + is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; @@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; + is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; @@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = B_FALSE; + is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { @@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], - B_FALSE)) == NULL) { + !(is_log || is_special || is_dedup || + is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) + if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); + } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_DRAID) == 0) { + if (draid_config_by_type(nv, + fulltype, children) != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); @@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. 
*/ - if ((nv = make_leaf_vdev(props, argv[0], - is_log)) == NULL) + if ((nv = make_leaf_vdev(props, argv[0], !(is_log || + is_special || is_dedup || is_spare))) == NULL) goto spec_out; - if (is_log) + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; + } + if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 31205a5bf..1c4da20e4 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -104,6 +104,7 @@ #include <sys/zio.h> #include <sys/zil.h> #include <sys/zil_impl.h> +#include <sys/vdev_draid.h> #include <sys/vdev_impl.h> #include <sys/vdev_file.h> #include <sys/vdev_initialize.h> @@ -167,8 +168,11 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; + int zo_raid_children; + int zo_raid_parity; + char zo_raid_type[8]; + int zo_draid_data; + int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; @@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, + .zo_raid_children = 4, + .zo_raid_parity = 1, + .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_draid_data = 4, /* data drives */ + .zo_draid_spares = 1, /* distributed spares */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ @@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ - (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -689,8 +696,11 @@ usage(boolean_t requested) "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" - "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" + "\t[-r raidz_disks / draid_disks (default: %d)]\n" + "\t[-R raid_parity (default: %d)]\n" + "\t[-K raid_kind (default: random)] raidz|draid|random\n" + "\t[-D draid_data (default: %d)] in config\n" + "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" @@ -716,8 +726,10 @@ usage(boolean_t requested) nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ + zo->zo_raid_children, /* -r */ + zo->zo_raid_parity, /* -R */ + zo->zo_draid_data, /* -D */ + zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ @@ -731,6 +743,21 @@ usage(boolean_t requested) exit(requested ? 
0 : 1); } +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) @@ -780,11 +807,12 @@ process_options(int argc, char **argv) int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; + char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': @@ -793,6 +821,8 @@ process_options(int argc, char **argv) case 'm': case 'r': case 'R': + case 'D': + case 'S': case 'd': case 't': case 'g': @@ -817,10 +847,19 @@ process_options(int argc, char **argv) zo->zo_mirrors = value; break; case 'r': - zo->zo_raidz = MAX(1, value); + zo->zo_raid_children = MAX(1, value); break; case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raid_parity = MIN(MAX(value, 1), 3); + break; + case 'K': + (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); + break; + case 'D': + zo->zo_draid_data = MAX(1, value); + break; + case 'S': + zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); @@ -895,7 +934,54 @@ process_options(int argc, char **argv) } } - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + /* When raid choice is 'random' add a draid pool 50% of the time */ + if (strcmp(raid_kind, "random") == 0) { + (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? + "draid" : "raidz", sizeof (raid_kind)); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("choosing RAID type '%s'\n", raid_kind); + } + + if (strcmp(raid_kind, "draid") == 0) { + uint64_t min_devsize; + + /* With fewer disk use 256M, otherwise 128M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (256ULL << 20) : (128ULL << 20); + + /* No top-level mirrors with dRAID for now */ + zo->zo_mirrors = 0; + + /* Use more appropriate defaults for dRAID */ + if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) + zo->zo_vdevs = 1; + if (zo->zo_raid_children == + ztest_opts_defaults.zo_raid_children) + zo->zo_raid_children = 16; + if (zo->zo_ashift < 12) + zo->zo_ashift = 12; + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + if (zo->zo_draid_data + zo->zo_raid_parity > + zo->zo_raid_children - zo->zo_draid_spares) { + (void) fprintf(stderr, "error: too few draid " + "children (%d) for stripe width (%d)\n", + zo->zo_raid_children, + zo->zo_draid_data + zo->zo_raid_parity); + usage(B_FALSE); + } + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, + sizeof (zo->zo_raid_type)); + + } else /* using raidz */ { + ASSERT0(strcmp(raid_kind, "raidz")); + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : @@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs) (void) kill(getpid(), SIGKILL); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - /* ARGSUSED */ static void ztest_record_enospc(const char *s) @@ -997,12 +1067,27 @@ ztest_get_ashift(void) return (ztest_opts.zo_ashift); } +static boolean_t +ztest_is_draid_spare(const char *name) +{ + uint64_t spare_id = 0, parity = 0, vdev_id = 0; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; + boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } + } else { + draid_spare = ztest_is_draid_spare(path); } - if (size != 0) { + if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); @@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) (void) close(fd); } - VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); - VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, + draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path)); + VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift)); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, +make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { - nvlist_t *raidz, **child; + nvlist_t *raid, **child; int c; if (r < 2) @@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size, for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, - child, r) == 0); + VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE, + ztest_opts.zo_raid_type)); + VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raid_parity)); + VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, + child, r)); + + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata = ztest_opts.zo_draid_data; + uint64_t nparity = ztest_opts.zo_raid_parity; + uint64_t nspares = ztest_opts.zo_draid_spares; + uint64_t children = ztest_opts.zo_raid_children; + uint64_t ngroups = 1; + + /* + * Calculate the minimum number of groups required to fill a + * slice. This is the LCM of the stripe width (data + parity) + * and the number of data drives (children - spares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } for (c = 0; c < r; c++) nvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); - return (raidz); + return (raid); } static nvlist_t * @@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, int c; if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -2809,6 +2918,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_mmp_test) return; + /* dRAID added after feature flags, skip upgrade test. */ + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + return; + mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); @@ -2818,13 +2931,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. 
 	 */
-	switch (ztest_opts.zo_raidz_parity) {
+	switch (ztest_opts.zo_raid_parity) {
 	case 0:
 	case 1:
 		initial_version = SPA_VERSION_INITIAL;
@@ -2970,7 +3083,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 		return;
 
 	mutex_enter(&ztest_vdev_lock);
-	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
+	    ztest_opts.zo_raid_children;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -3024,7 +3138,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 		 */
 		nvroot = make_vdev_root(NULL, NULL, NULL,
 		    ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
-		    "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+		    "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
+		    1);
 
 		error = spa_vdev_add(spa, nvroot);
 		nvlist_free(nvroot);
@@ -3078,14 +3193,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
 		return;
 	}
 
-	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
+	    ztest_opts.zo_raid_children;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
-	    class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+	    class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
 
 	error = spa_vdev_add(spa, nvroot);
 	nvlist_free(nvroot);
@@ -3134,7 +3250,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 	char *aux;
 	char *path;
 	uint64_t guid = 0;
-	int error;
+	int error, ignore_err = 0;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
@@ -3157,7 +3273,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 		/*
 		 * Pick a random device to remove.
 		 */
-		guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
+		vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+
+		/* dRAID spares cannot be removed; try anyway to see ENOTSUP */
+		if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL)
+			ignore_err = ENOTSUP;
+
+		guid = svd->vdev_guid;
 	} else {
 		/*
 		 * Find an unused device we can add.
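The group-count loop added to make_vdev_raid() above deserves a worked example: it searches for the smallest `ngroups` whose total slot count, `ngroups * (ndata + nparity)`, lands exactly on a multiple of the `children - nspares` data drives, which is the same as computing lcm(stripe width, data drives) / stripe width. The sketch below is a standalone illustration, not part of the patch; the layout constants assume a `draid2:8d:68c:2s` configuration and the `gcd()` helper is introduced here only for the comparison.

```
/*
 * Standalone sketch (not part of the patch): reproduce the ngroups
 * search from make_vdev_raid() and check it against the closed form
 * lcm(width, drives) / width.  Layout constants assume draid2:8d:68c:2s.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = a % b;
		a = b;
		b = t;
	}
	return (a);
}

int
main(void)
{
	uint64_t ndata = 8, nparity = 2, children = 68, nspares = 2;
	uint64_t width = ndata + nparity;	/* stripe width: 10 */
	uint64_t drives = children - nspares;	/* data drives: 66 */
	uint64_t ngroups = 1;

	/* The search loop, as in make_vdev_raid(). */
	while (ngroups * width % drives != 0)
		ngroups++;

	/* Closed form: ngroups * width == lcm(width, drives). */
	uint64_t lcm = width / gcd(width, drives) * drives;

	/* Both report 33 groups for this layout. */
	printf("loop: %llu  closed form: %llu\n",
	    (unsigned long long)ngroups,
	    (unsigned long long)(lcm / width));
	return (0);
}
```

For this layout both forms yield 33: thirty-three groups of width 10 tile the 66 data drives exactly five times before the pattern repeats.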
@@ -3214,7 +3336,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 		case ZFS_ERR_DISCARDING_CHECKPOINT:
 			break;
 		default:
-			fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+			if (error != ignore_err)
+				fatal(0, "spa_vdev_remove(%llu) = %d", guid,
+				    error);
 		}
 	}
 
@@ -3243,7 +3367,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
 	mutex_enter(&ztest_vdev_lock);
 
 	/* ensure we have a usable config; mirrors of raidz aren't supported */
-	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+	if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) {
 		mutex_exit(&ztest_vdev_lock);
 		return;
 	}
@@ -3343,6 +3467,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 	int replacing;
 	int oldvd_has_siblings = B_FALSE;
 	int newvd_is_spare = B_FALSE;
+	int newvd_is_dspare = B_FALSE;
 	int oldvd_is_log;
 	int error, expected_error;
 
@@ -3353,7 +3478,7 @@
 	newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 	mutex_enter(&ztest_vdev_lock);
-	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
@@ -3393,14 +3518,17 @@
 	if (zs->zs_mirrors >= 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
 		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
-		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
+		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
 	}
 
 	/* pick a child out of the raidz group */
-	if (ztest_opts.zo_raidz > 1) {
-		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
-		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
-		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
+	if (ztest_opts.zo_raid_children > 1) {
+		if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0)
+			ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		else
+			ASSERT(oldvd->vdev_ops == &vdev_draid_ops);
+		ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children);
+		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
 	}
 
 	/*
@@ -3447,6 +3575,10 @@
 	if (sav->sav_count != 0 && ztest_random(3) == 0) {
 		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
 		newvd_is_spare = B_TRUE;
+
+		if (newvd->vdev_ops == &vdev_draid_spare_ops)
+			newvd_is_dspare = B_TRUE;
+
 		(void) strcpy(newpath, newvd->vdev_path);
 	} else {
 		(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
@@ -3480,6 +3612,9 @@
 	 * If newvd is already part of the pool, it should fail with EBUSY.
 	 *
 	 * If newvd is too small, it should fail with EOVERFLOW.
+	 *
+	 * If newvd is a distributed spare and it's being attached to a
+	 * dRAID which is not its parent, it should fail with ENOTSUP.
 	 */
 	if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
@@ -3492,10 +3627,12 @@
 		expected_error = replacing ? 0 : EBUSY;
 	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
 		expected_error = EBUSY;
-	else if (newsize < oldsize)
+	else if (!newvd_is_dspare && newsize < oldsize)
 		expected_error = EOVERFLOW;
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
+	else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
+		expected_error = ENOTSUP;
 	else
 		expected_error = 0;
 
@@ -4880,13 +5017,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
-			VERIFY(0 == dmu_read(os, packobj, packoff,
+			VERIFY0(dmu_read(os, packobj, packoff,
 			    packsize, packcheck, DMU_READ_PREFETCH));
-			VERIFY(0 == dmu_read(os, bigobj, bigoff,
+			VERIFY0(dmu_read(os, bigobj, bigoff,
 			    bigsize, bigcheck, DMU_READ_PREFETCH));
 
-			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
-			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+			ASSERT0(bcmp(packbuf, packcheck, packsize));
+			ASSERT0(bcmp(bigbuf, bigcheck, bigsize));
 
 			umem_free(packcheck, packsize);
 			umem_free(bigcheck, bigsize);
@@ -5761,7 +5898,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 	}
 
 	maxfaults = MAXFAULTS(zs);
-	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
 	mirror_save = zs->zs_mirrors;
 	mutex_exit(&ztest_vdev_lock);
 
@@ -6011,7 +6148,7 @@ out:
 /*
  * By design ztest will never inject uncorrectable damage into the pool.
  * Issue a scrub, wait for it to complete, and verify there is never any
- * any persistent damage.
+ * persistent damage.
  *
 * Only after a full scrub has been completed is it safe to start injecting
 * data corruption. See the comment in zfs_fault_inject().
@@ -7347,7 +7484,7 @@ ztest_init(ztest_shared_t *zs)
 	zs->zs_splits = 0;
 	zs->zs_mirrors = ztest_opts.zo_mirrors;
 	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
-	    NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+	    NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
 	props = make_random_props();
 
 	/*
@@ -7683,10 +7820,12 @@ main(int argc, char **argv)
 
 	if (ztest_opts.zo_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
-		    " %llu seconds...\n",
+		    " %d %s disks, %llu seconds...\n\n",
 		    (u_longlong_t)ztest_opts.zo_vdevs,
 		    ztest_opts.zo_datasets,
 		    ztest_opts.zo_threads,
+		    ztest_opts.zo_raid_children,
+		    ztest_opts.zo_raid_type,
 		    (u_longlong_t)ztest_opts.zo_time);
 	}
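A closing note on the convention several of these hunks rely on: ztest_is_draid_spare() classifies a leaf as a distributed spare purely by parsing its name as draid<parity>-<top-level vdev id>-<spare id>. The following is a minimal standalone sketch of that check, not the patch's code verbatim; it assumes VDEV_TYPE_DRAID expands to the literal "draid", and the test names are hypothetical.

```
/*
 * Standalone sketch (not part of the patch) of the name test in
 * ztest_is_draid_spare().  Distributed spares are named
 * draid<parity>-<top-level vdev id>-<spare id>; VDEV_TYPE_DRAID is
 * assumed to expand to the literal "draid".
 */
#include <stdio.h>

static int
is_draid_spare(const char *name)
{
	unsigned long long parity, vdev_id, spare_id;

	/* All three fields must parse for the name to qualify. */
	return (sscanf(name, "draid%llu-%llu-%llu",
	    &parity, &vdev_id, &spare_id) == 3);
}

int
main(void)
{
	const char *names[] = { "draid2-0-0", "draid1-3-1", "/dev/sda" };

	for (int i = 0; i < 3; i++)
		printf("%-12s -> %s\n", names[i],
		    is_draid_spare(names[i]) ? "dRAID spare" : "regular vdev");
	return (0);
}
```

This name-based detection is also why make_vdev_file() above skips creating a backing file for such paths and records the vdev type as VDEV_TYPE_DRAID_SPARE instead of VDEV_TYPE_FILE: presumably a distributed spare has no backing device of its own, only capacity spread across the dRAID's children.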