author:    Brian Behlendorf <[email protected]>  2020-11-13 13:51:51 -0800
committer: GitHub <[email protected]>            2020-11-13 13:51:51 -0800
commit:    b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree:      6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /lib
parent:    a724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
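For example, with the pool and spare names from the `zpool status` output shown further below, rebuilding a failed child onto a distributed spare is an ordinary replace. This is a hypothetical invocation for illustration, not part of the commit:

```
# Hypothetical: rebuild failed child U27 onto the pool's
# first distributed spare.
zpool replace tank U27 draid2-0-0
```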
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool; reasonable default values are chosen based on the number of
child vdevs in the dRAID vdev.
```
zpool create <pool> draid[1,2,3] <vdevs...>
```
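As a concrete illustration (hypothetical pool and disk names, not from the commit), a double-parity dRAID vdev with the default layout could be created as:

```
# Hypothetical: 11-disk double-parity dRAID vdev, with the data
# width and spare count left to the defaults.
zpool create tank draid2 sda sdb sdc sdd sde sdf sdg sdh sdi sdj sdk
```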
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon-separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options, illustrated by the example
following the list, include:
```
zpool create <pool> \
    draid[<parity>][:<data>d][:<children>c][:<spares>s] \
    <vdevs...>
```
- `draid[<parity>]` - Parity level (default 1)
- `draid[:<data>d]` - Data devices per group (default 8)
- `draid[:<children>c]` - Expected number of child vdevs
- `draid[:<spares>s]` - Distributed hot spares (default 0)
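Putting these together, the fully specified layout used by the 68-disk example below could be requested as follows (hypothetical pool and device names; bash brace expansion used for brevity):

```
# Hypothetical: double parity, 8 data disks per group, 68 expected
# children, and 2 distributed spares.
zpool create tank draid2:8d:68c:2s /dev/disk/by-vdev/disk{00..67}
```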
Abbreviated example `zpool status` output for a 68-disk dRAID pool
with two distributed spares, using special allocation classes:
```
  pool: tank
 state: ONLINE
config:

        NAME                  STATE     READ WRITE CKSUM
        slag7                 ONLINE       0     0     0
          draid2:8d:68c:2s-0  ONLINE       0     0     0
            L0                ONLINE       0     0     0
            L1                ONLINE       0     0     0
            ...
            U25               ONLINE       0     0     0
            U26               ONLINE       0     0     0
            spare-53          ONLINE       0     0     0
              U27             ONLINE       0     0     0
              draid2-0-0      ONLINE       0     0     0
            U28               ONLINE       0     0     0
            U29               ONLINE       0     0     0
            ...
            U42               ONLINE       0     0     0
            U43               ONLINE       0     0     0
        special
          mirror-1            ONLINE       0     0     0
            L5                ONLINE       0     0     0
            U5                ONLINE       0     0     0
          mirror-2            ONLINE       0     0     0
            L6                ONLINE       0     0     0
            U6                ONLINE       0     0     0
        spares
          draid2-0-0          INUSE     currently in use
          draid2-0-1          AVAIL
```
When adding test coverage for the new dRAID vdev type, the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
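For instance, a single run pinned to one dRAID layout might be invoked as shown below (hypothetical values; the remaining ztest defaults are assumed):

```
# Hypothetical: exercise a double-parity dRAID vdev with 8 data
# drives per group and 2 distributed spares.
ztest -K draid -R 2 -D 8 -S 2
```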
The zpool_create, zpool_import, redundancy, replacement, and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
Diffstat (limited to 'lib')
```
-rw-r--r--  lib/libzfs/libzfs_dataset.c | 93
-rw-r--r--  lib/libzfs/libzfs_import.c  |  1
-rw-r--r--  lib/libzfs/libzfs_pool.c    | 85
-rw-r--r--  lib/libzpool/Makefile.am    |  4

4 files changed, 155 insertions, 28 deletions
```
```
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 1eaed435c..47418b323 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
  * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
  * the 128k block example above.
  *
+ * The situation is slightly different for dRAID since the minimum allocation
+ * size is the full group width. The same 8K block above would be written as
+ * follows in a dRAID group:
+ *
+ *     +-------+-------+-------+-------+-------+
+ *     | disk1 | disk2 | disk3 | disk4 | disk5 |
+ *     +-------+-------+-------+-------+-------+
+ *     |  P0   |  D0   |  D1   |  S0   |  S1   |
+ *     +-------+-------+-------+-------+-------+
+ *
  * Compression may lead to a variety of block sizes being written for the same
  * volume or file. There is no clear way to reserve just the amount of space
  * that will be required, so the worst case (no compression) is assumed.
@@ -5366,6 +5376,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
 }
 
 /*
+ * Derived from function of same name in module/zfs/vdev_draid.c. Returns the
+ * amount of space (in bytes) that will be allocated for the specified block
+ * size.
+ */
+static uint64_t
+vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+    uint64_t blksize)
+{
+        ASSERT3U(ndisks, >, nparity);
+        uint64_t ndata = ndisks - nparity;
+        uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1;
+        uint64_t asize = (rows * ndisks) << ashift;
+
+        return (asize);
+}
+
+/*
  * Determine how much space will be allocated if it lands on the most space-
  * inefficient top-level vdev. Returns the size in bytes required to store one
  * copy of the volume data. See theory comment above.
@@ -5374,7 +5401,7 @@ static uint64_t
 volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 {
         nvlist_t *config, *tree, **vdevs;
-        uint_t nvdevs, v;
+        uint_t nvdevs;
         uint64_t ret = 0;
 
         config = zpool_get_config(zhp, NULL);
@@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
                 return (nblocks * blksize);
         }
 
-        for (v = 0; v < nvdevs; v++) {
+        for (int v = 0; v < nvdevs; v++) {
                 char *type;
                 uint64_t nparity, ashift, asize, tsize;
-                nvlist_t **disks;
-                uint_t ndisks;
                 uint64_t volsize;
 
                 if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
-                    &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
-                    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
-                    &nparity) != 0 ||
-                    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
-                    &ashift) != 0 ||
-                    nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
-                    &disks, &ndisks) != 0) {
+                    &type) != 0)
+                        continue;
+
+                if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
+                    strcmp(type, VDEV_TYPE_DRAID) != 0)
+                        continue;
+
+                if (nvlist_lookup_uint64(vdevs[v],
+                    ZPOOL_CONFIG_NPARITY, &nparity) != 0)
+                        continue;
+
+                if (nvlist_lookup_uint64(vdevs[v],
+                    ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
                         continue;
-                }
 
-                /* allocation size for the "typical" 128k block */
-                tsize = vdev_raidz_asize(ndisks, nparity, ashift,
-                    SPA_OLD_MAXBLOCKSIZE);
-                /* allocation size for the blksize block */
-                asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
+                if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+                        nvlist_t **disks;
+                        uint_t ndisks;
+
+                        if (nvlist_lookup_nvlist_array(vdevs[v],
+                            ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0)
+                                continue;
+
+                        /* allocation size for the "typical" 128k block */
+                        tsize = vdev_raidz_asize(ndisks, nparity, ashift,
+                            SPA_OLD_MAXBLOCKSIZE);
+
+                        /* allocation size for the blksize block */
+                        asize = vdev_raidz_asize(ndisks, nparity, ashift,
+                            blksize);
+                } else {
+                        uint64_t ndata;
+
+                        if (nvlist_lookup_uint64(vdevs[v],
+                            ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0)
+                                continue;
+
+                        /* allocation size for the "typical" 128k block */
+                        tsize = vdev_draid_asize(ndata + nparity, nparity,
+                            ashift, SPA_OLD_MAXBLOCKSIZE);
+
+                        /* allocation size for the blksize block */
+                        asize = vdev_draid_asize(ndata + nparity, nparity,
+                            ashift, blksize);
+                }
 
                 /*
-                 * Scale this size down as a ratio of 128k / tsize. See theory
-                 * statement above.
+                 * Scale this size down as a ratio of 128k / tsize.
+                 * See theory statement above.
                  */
                 volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
                 if (volsize > ret) {
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 6c5f61836..44d3ade49 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig)
         return (refresh_config((libzfs_handle_t *)handle, tryconfig));
 }
 
-
 static int
 pool_active_libzfs(void *handle, const char *name, uint64_t guid,
     boolean_t *isactive)
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 00b0b6faf..16f8e3e7f 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -42,10 +42,10 @@
 #include <sys/efi_partition.h>
 #include <sys/systeminfo.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
 #include <sys/vdev_disk.h>
 #include <dlfcn.h>
 #include <libzutil.h>
-
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
@@ -481,7 +481,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
                         if (err != 0) {
                                 ASSERT3U(err, ==, ENOENT);
                                 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                    "invalid feature '%s'"), fname);
+                                    "feature '%s' unsupported by kernel"),
+                                    fname);
                                 (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
                                 goto error;
                         }
@@ -960,6 +961,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
         if (ret == 0 && !isopen &&
             (strncmp(pool, "mirror", 6) == 0 ||
             strncmp(pool, "raidz", 5) == 0 ||
+            strncmp(pool, "draid", 5) == 0 ||
             strncmp(pool, "spare", 5) == 0 ||
             strcmp(pool, "log") == 0)) {
                 if (hdl != NULL)
@@ -1187,6 +1189,37 @@ zpool_has_special_vdev(nvlist_t *nvroot)
 }
 
 /*
+ * Output a dRAID top-level vdev name into the provided buffer.
+ */
+static char *
+zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity,
+    uint64_t spares, uint64_t children)
+{
+        snprintf(name, len, "%s%llu:%llud:%lluc:%llus",
+            VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data,
+            (u_longlong_t)children, (u_longlong_t)spares);
+
+        return (name);
+}
+
+/*
+ * Return B_TRUE if the provided name is a dRAID spare name.
+ */
+boolean_t
+zpool_is_draid_spare(const char *name)
+{
+        uint64_t spare_id, parity, vdev_id;
+
+        if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
+            (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
+            (u_longlong_t *)&spare_id) == 3) {
+                return (B_TRUE);
+        }
+
+        return (B_FALSE);
+}
+
+/*
  * Create the named pool, using the provided vdev list. It is assumed
  * that the consumer has already validated the contents of the nvlist, so we
  * don't have to worry about error semantics.
@@ -2668,6 +2701,11 @@ zpool_vdev_is_interior(const char *name)
             VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
             strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
                 return (B_TRUE);
+
+        if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 &&
+            !zpool_is_draid_spare(name))
+                return (B_TRUE);
+
         return (B_FALSE);
 }
 
@@ -3101,7 +3139,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
                 verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
                     &type) == 0);
 
-                if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
+                if ((strcmp(type, VDEV_TYPE_SPARE) == 0 ||
+                    strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) &&
                     children == 2 && child[which] == tgt)
                         return (B_TRUE);
 
@@ -3216,8 +3255,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
                             "cannot replace a log with a spare"));
                 } else if (rebuild) {
                         zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                            "only mirror vdevs support sequential "
-                            "reconstruction"));
+                            "only mirror and dRAID vdevs support "
+                            "sequential reconstruction"));
+                } else if (zpool_is_draid_spare(new_disk)) {
+                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                            "dRAID spares can only replace child "
+                            "devices in their parent's dRAID vdev"));
                 } else if (version >= SPA_VERSION_MULTI_REPLACE) {
                         zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                             "already in replacing/spare config; wait "
@@ -3618,6 +3661,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
         (void) snprintf(msg, sizeof (msg),
             dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
 
+        if (zpool_is_draid_spare(path)) {
+                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                    "dRAID spares cannot be removed"));
+                return (zfs_error(hdl, EZFS_NODEVICE, msg));
+        }
+
         (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
         if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
             &islog)) == NULL)
@@ -3955,9 +4004,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
         }
 
         /*
-         * Remove the partition from the path it this is a whole disk.
+         * Remove the partition from the path if this is a whole disk.
          */
-        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
+        if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 &&
+            nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
             == 0 && value && !(name_flags & VDEV_NAME_PATH)) {
                 return (zfs_strip_partition(path));
         }
@@ -3976,6 +4026,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
         }
 
         /*
+         * If it's a dRAID device, we add parity, groups, and spares.
+         */
+        if (strcmp(path, VDEV_TYPE_DRAID) == 0) {
+                uint64_t ndata, nparity, nspares;
+                nvlist_t **child;
+                uint_t children;
+
+                verify(nvlist_lookup_nvlist_array(nv,
+                    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+                verify(nvlist_lookup_uint64(nv,
+                    ZPOOL_CONFIG_NPARITY, &nparity) == 0);
+                verify(nvlist_lookup_uint64(nv,
+                    ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0);
+                verify(nvlist_lookup_uint64(nv,
+                    ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0);
+
+                path = zpool_draid_name(buf, sizeof (buf), ndata,
+                    nparity, nspares, children);
+        }
+
+        /*
          * We identify each top-level vdev by using a <type-id>
          * naming convention.
          */
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index d427bda36..5b938bd4a 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -124,6 +124,8 @@ KERNEL_C = \
         unique.c \
         vdev.c \
         vdev_cache.c \
+        vdev_draid.c \
+        vdev_draid_rand.c \
         vdev_file.c \
         vdev_indirect_births.c \
         vdev_indirect.c \
@@ -216,7 +218,7 @@ libzpool_la_LIBADD = \
         $(abs_top_builddir)/lib/libnvpair/libnvpair.la \
         $(abs_top_builddir)/lib/libzstd/libzstd.la
 
-libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl
+libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm
 
 libzpool_la_LDFLAGS = -pthread
```
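As a sanity check on the vdev_draid_asize() arithmetic added above, this small shell sketch (not part of the commit) reproduces the 8K-block example from the comment, assuming 4K sectors and a single-parity, five-disk group as drawn in the P0/D0/D1/S0/S1 diagram:

```
# Sketch only: recompute the dRAID allocation size for an 8K block.
ashift=12        # 4K sectors
ndisks=5         # 1 parity + 4 data disks in the group
nparity=1
blksize=8192
ndata=$((ndisks - nparity))
rows=$(( (blksize - 1) / (ndata << ashift) + 1 ))
asize=$(( (rows * ndisks) << ashift ))
echo "$asize"    # 20480: one full 5-wide row (2 data, 1 parity, 2 pad sectors)
```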