aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2020-11-13 13:51:51 -0800
committerGitHub <[email protected]>2020-11-13 13:51:51 -0800
commitb2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /lib
parenta724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands for Distributed parity RAID. This pool configuration allows all dRAID vdevs to participate when rebuilding to a distributed hot spare device. This can substantially reduce the total time required to restore full parity to a pool with a failed device. A dRAID pool can be created using the new top-level `draid` type. Like `raidz`, the desired redundancy is specified after the type: `draid[1,2,3]`. No additional information is required to create the pool and reasonable default values will be chosen based on the number of child vdevs in the dRAID vdev. zpool create <pool> draid[1,2,3] <vdevs...> Unlike raidz, additional optional dRAID configuration values can be provided as part of the draid type as colon separated values. This allows administrators to fully specify a layout for either performance or capacity reasons. The supported options include: zpool create <pool> \ draid[<parity>][:<data>d][:<children>c][:<spares>s] \ <vdevs...> - draid[parity] - Parity level (default 1) - draid[:<data>d] - Data devices per group (default 8) - draid[:<children>c] - Expected number of child vdevs - draid[:<spares>s] - Distributed hot spares (default 0) Abbreviated example `zpool status` output for a 68 disk dRAID pool with two distributed spares using special allocation classes. ``` pool: tank state: ONLINE config: NAME STATE READ WRITE CKSUM slag7 ONLINE 0 0 0 draid2:8d:68c:2s-0 ONLINE 0 0 0 L0 ONLINE 0 0 0 L1 ONLINE 0 0 0 ... U25 ONLINE 0 0 0 U26 ONLINE 0 0 0 spare-53 ONLINE 0 0 0 U27 ONLINE 0 0 0 draid2-0-0 ONLINE 0 0 0 U28 ONLINE 0 0 0 U29 ONLINE 0 0 0 ... U42 ONLINE 0 0 0 U43 ONLINE 0 0 0 special mirror-1 ONLINE 0 0 0 L5 ONLINE 0 0 0 U5 ONLINE 0 0 0 mirror-2 ONLINE 0 0 0 L6 ONLINE 0 0 0 U6 ONLINE 0 0 0 spares draid2-0-0 INUSE currently in use draid2-0-1 AVAIL ``` When adding test coverage for the new dRAID vdev type the following options were added to the ztest command. 
These options are leveraged by zloop.sh to test a wide range of dRAID configurations. -K draid|raidz|random - kind of RAID to test -D <value> - dRAID data drives per group -S <value> - dRAID distributed hot spares -R <value> - RAID parity (raidz or dRAID) The zpool_create, zpool_import, redundancy, replacement and fault test groups have all been updated to provide test coverage for the dRAID feature. Co-authored-by: Isaac Huang <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Don Brady <[email protected]> Co-authored-by: Matthew Ahrens <[email protected]> Co-authored-by: Brian Behlendorf <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #10102
Diffstat (limited to 'lib')
-rw-r--r--lib/libzfs/libzfs_dataset.c93
-rw-r--r--lib/libzfs/libzfs_import.c1
-rw-r--r--lib/libzfs/libzfs_pool.c85
-rw-r--r--lib/libzpool/Makefile.am4
4 files changed, 155 insertions, 28 deletions
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 1eaed435c..47418b323 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
* the 128k block example above.
*
+ * The situation is slightly different for dRAID since the minimum allocation
+ * size is the full group width. The same 8K block above would be written as
+ * follows in a dRAID group:
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * | P0 | D0 | D1 | S0 | S1 |
+ * +-------+-------+-------+-------+-------+
+ *
* Compression may lead to a variety of block sizes being written for the same
* volume or file. There is no clear way to reserve just the amount of space
* that will be required, so the worst case (no compression) is assumed.
@@ -5366,6 +5376,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
}
/*
+ * Derived from function of same name in module/zfs/vdev_draid.c. Returns the
+ * amount of space (in bytes) that will be allocated for the specified block
+ * size.
+ */
+static uint64_t
+vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+ uint64_t blksize)
+{
+ ASSERT3U(ndisks, >, nparity);
+ uint64_t ndata = ndisks - nparity;
+ uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1;
+ uint64_t asize = (rows * ndisks) << ashift;
+
+ return (asize);
+}
+
+/*
* Determine how much space will be allocated if it lands on the most space-
* inefficient top-level vdev. Returns the size in bytes required to store one
* copy of the volume data. See theory comment above.
@@ -5374,7 +5401,7 @@ static uint64_t
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
{
nvlist_t *config, *tree, **vdevs;
- uint_t nvdevs, v;
+ uint_t nvdevs;
uint64_t ret = 0;
config = zpool_get_config(zhp, NULL);
@@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
return (nblocks * blksize);
}
- for (v = 0; v < nvdevs; v++) {
+ for (int v = 0; v < nvdevs; v++) {
char *type;
uint64_t nparity, ashift, asize, tsize;
- nvlist_t **disks;
- uint_t ndisks;
uint64_t volsize;
if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
- &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
- nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
- &nparity) != 0 ||
- nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
- &ashift) != 0 ||
- nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
- &disks, &ndisks) != 0) {
+ &type) != 0)
+ continue;
+
+ if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
+ strcmp(type, VDEV_TYPE_DRAID) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(vdevs[v],
+ ZPOOL_CONFIG_NPARITY, &nparity) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(vdevs[v],
+ ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
continue;
- }
- /* allocation size for the "typical" 128k block */
- tsize = vdev_raidz_asize(ndisks, nparity, ashift,
- SPA_OLD_MAXBLOCKSIZE);
- /* allocation size for the blksize block */
- asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
+ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+ nvlist_t **disks;
+ uint_t ndisks;
+
+ if (nvlist_lookup_nvlist_array(vdevs[v],
+ ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0)
+ continue;
+
+ /* allocation size for the "typical" 128k block */
+ tsize = vdev_raidz_asize(ndisks, nparity, ashift,
+ SPA_OLD_MAXBLOCKSIZE);
+
+ /* allocation size for the blksize block */
+ asize = vdev_raidz_asize(ndisks, nparity, ashift,
+ blksize);
+ } else {
+ uint64_t ndata;
+
+ if (nvlist_lookup_uint64(vdevs[v],
+ ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0)
+ continue;
+
+ /* allocation size for the "typical" 128k block */
+ tsize = vdev_draid_asize(ndata + nparity, nparity,
+ ashift, SPA_OLD_MAXBLOCKSIZE);
+
+ /* allocation size for the blksize block */
+ asize = vdev_draid_asize(ndata + nparity, nparity,
+ ashift, blksize);
+ }
/*
- * Scale this size down as a ratio of 128k / tsize. See theory
- * statement above.
+ * Scale this size down as a ratio of 128k / tsize.
+ * See theory statement above.
*/
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
if (volsize > ret) {
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 6c5f61836..44d3ade49 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig)
return (refresh_config((libzfs_handle_t *)handle, tryconfig));
}
-
static int
pool_active_libzfs(void *handle, const char *name, uint64_t guid,
boolean_t *isactive)
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 00b0b6faf..16f8e3e7f 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -42,10 +42,10 @@
#include <sys/efi_partition.h>
#include <sys/systeminfo.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
#include <sys/vdev_disk.h>
#include <dlfcn.h>
#include <libzutil.h>
-
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
@@ -481,7 +481,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
if (err != 0) {
ASSERT3U(err, ==, ENOENT);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid feature '%s'"), fname);
+ "feature '%s' unsupported by kernel"),
+ fname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
@@ -960,6 +961,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
if (ret == 0 && !isopen &&
(strncmp(pool, "mirror", 6) == 0 ||
strncmp(pool, "raidz", 5) == 0 ||
+ strncmp(pool, "draid", 5) == 0 ||
strncmp(pool, "spare", 5) == 0 ||
strcmp(pool, "log") == 0)) {
if (hdl != NULL)
@@ -1187,6 +1189,37 @@ zpool_has_special_vdev(nvlist_t *nvroot)
}
/*
+ * Output a dRAID top-level vdev name in to the provided buffer.
+ */
+static char *
+zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity,
+ uint64_t spares, uint64_t children)
+{
+ snprintf(name, len, "%s%llu:%llud:%lluc:%llus",
+ VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data,
+ (u_longlong_t)children, (u_longlong_t)spares);
+
+ return (name);
+}
+
+/*
+ * Return B_TRUE if the provided name is a dRAID spare name.
+ */
+boolean_t
+zpool_is_draid_spare(const char *name)
+{
+ uint64_t spare_id, parity, vdev_id;
+
+ if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
+ (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
+ (u_longlong_t *)&spare_id) == 3) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* Create the named pool, using the provided vdev list. It is assumed
* that the consumer has already validated the contents of the nvlist, so we
* don't have to worry about error semantics.
@@ -2668,6 +2701,11 @@ zpool_vdev_is_interior(const char *name)
VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
return (B_TRUE);
+
+ if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 &&
+ !zpool_is_draid_spare(name))
+ return (B_TRUE);
+
return (B_FALSE);
}
@@ -3101,7 +3139,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
&type) == 0);
- if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
+ if ((strcmp(type, VDEV_TYPE_SPARE) == 0 ||
+ strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) &&
children == 2 && child[which] == tgt)
return (B_TRUE);
@@ -3216,8 +3255,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
"cannot replace a log with a spare"));
} else if (rebuild) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "only mirror vdevs support sequential "
- "reconstruction"));
+ "only mirror and dRAID vdevs support "
+ "sequential reconstruction"));
+ } else if (zpool_is_draid_spare(new_disk)) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "dRAID spares can only replace child "
+ "devices in their parent's dRAID vdev"));
} else if (version >= SPA_VERSION_MULTI_REPLACE) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"already in replacing/spare config; wait "
@@ -3618,6 +3661,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
+ if (zpool_is_draid_spare(path)) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "dRAID spares cannot be removed"));
+ return (zfs_error(hdl, EZFS_NODEVICE, msg));
+ }
+
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
&islog)) == NULL)
@@ -3955,9 +4004,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
}
/*
- * Remove the partition from the path it this is a whole disk.
+ * Remove the partition from the path if this is a whole disk.
*/
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
+ if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 &&
+ nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
== 0 && value && !(name_flags & VDEV_NAME_PATH)) {
return (zfs_strip_partition(path));
}
@@ -3976,6 +4026,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
}
/*
+ * If it's a dRAID device, we add parity, groups, and spares.
+ */
+ if (strcmp(path, VDEV_TYPE_DRAID) == 0) {
+ uint64_t ndata, nparity, nspares;
+ nvlist_t **child;
+ uint_t children;
+
+ verify(nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+ verify(nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_NPARITY, &nparity) == 0);
+ verify(nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0);
+ verify(nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0);
+
+ path = zpool_draid_name(buf, sizeof (buf), ndata,
+ nparity, nspares, children);
+ }
+
+ /*
* We identify each top-level vdev by using a <type-id>
* naming convention.
*/
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index d427bda36..5b938bd4a 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -124,6 +124,8 @@ KERNEL_C = \
unique.c \
vdev.c \
vdev_cache.c \
+ vdev_draid.c \
+ vdev_draid_rand.c \
vdev_file.c \
vdev_indirect_births.c \
vdev_indirect.c \
@@ -216,7 +218,7 @@ libzpool_la_LIBADD = \
$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
$(abs_top_builddir)/lib/libzstd/libzstd.la
-libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl
+libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm
libzpool_la_LDFLAGS = -pthread