summaryrefslogtreecommitdiffstats
path: root/module/zfs/spa.c
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2020-11-13 13:51:51 -0800
committerGitHub <[email protected]>2020-11-13 13:51:51 -0800
commitb2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /module/zfs/spa.c
parenta724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands for Distributed parity RAID. This pool configuration allows all dRAID vdevs to participate when rebuilding to a distributed hot spare device. This can substantially reduce the total time required to restore full parity to a pool with a failed device. A dRAID pool can be created using the new top-level `draid` type. Like `raidz`, the desired redundancy is specified after the type: `draid[1,2,3]`. No additional information is required to create the pool and reasonable default values will be chosen based on the number of child vdevs in the dRAID vdev. zpool create <pool> draid[1,2,3] <vdevs...> Unlike raidz, additional optional dRAID configuration values can be provided as part of the draid type as colon-separated values. This allows administrators to fully specify a layout for either performance or capacity reasons. The supported options include: zpool create <pool> \ draid[<parity>][:<data>d][:<children>c][:<spares>s] \ <vdevs...> - draid[parity] - Parity level (default 1) - draid[:<data>d] - Data devices per group (default 8) - draid[:<children>c] - Expected number of child vdevs - draid[:<spares>s] - Distributed hot spares (default 0) Abbreviated example `zpool status` output for a 68 disk dRAID pool with two distributed spares using special allocation classes. ``` pool: tank state: ONLINE config: NAME STATE READ WRITE CKSUM slag7 ONLINE 0 0 0 draid2:8d:68c:2s-0 ONLINE 0 0 0 L0 ONLINE 0 0 0 L1 ONLINE 0 0 0 ... U25 ONLINE 0 0 0 U26 ONLINE 0 0 0 spare-53 ONLINE 0 0 0 U27 ONLINE 0 0 0 draid2-0-0 ONLINE 0 0 0 U28 ONLINE 0 0 0 U29 ONLINE 0 0 0 ... U42 ONLINE 0 0 0 U43 ONLINE 0 0 0 special mirror-1 ONLINE 0 0 0 L5 ONLINE 0 0 0 U5 ONLINE 0 0 0 mirror-2 ONLINE 0 0 0 L6 ONLINE 0 0 0 U6 ONLINE 0 0 0 spares draid2-0-0 INUSE currently in use draid2-0-1 AVAIL ``` When adding test coverage for the new dRAID vdev type the following options were added to the ztest command. 
These options are leveraged by zloop.sh to test a wide range of dRAID configurations. -K draid|raidz|random - kind of RAID to test -D <value> - dRAID data drives per group -S <value> - dRAID distributed hot spares -R <value> - RAID parity (raidz or dRAID) The zpool_create, zpool_import, redundancy, replacement and fault test groups have all been updated to provide test coverage for the dRAID feature. Co-authored-by: Isaac Huang <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Don Brady <[email protected]> Co-authored-by: Matthew Ahrens <[email protected]> Co-authored-by: Brian Behlendorf <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #10102
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r--module/zfs/spa.c126
1 files changed, 94 insertions, 32 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 9d1d4e0cc..ae8964e6f 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -60,6 +60,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
+#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
@@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
/*
* Build a new vdev tree from the trusted config
*/
- VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_load_failed(spa, "spa_config_parse failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
/*
* Vdev paths in the MOS may be obsolete. If the untrusted config was
@@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version, obj;
+ uint64_t version, obj, ndraid = 0;
boolean_t has_features;
boolean_t has_encryption;
boolean_t has_allocclass;
@@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_aux(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) == 0) {
+ (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
+ (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
/*
* instantiate the metaslab groups (this will dirty the vdevs)
* we can no longer error exit past this point
@@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_sync_props(props, tx);
}
+ for (int i = 0; i < ndraid; i++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
@@ -6404,12 +6415,25 @@ spa_reset(const char *pool)
*/
/*
+ * This is called as a synctask to increment the draid feature flag
+ */
+static void
+spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int draid = (int)(uintptr_t)arg;
+
+ for (int c = 0; c < draid; c++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+}
+
+/*
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, ndraid = 0;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
if (vd->vdev_children != 0 &&
- (error = vdev_create(vd, txg, B_FALSE)) != 0)
+ (error = vdev_create(vd, txg, B_FALSE)) != 0) {
return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * The virtual dRAID spares must be added after vdev tree is created
+ * and the vdev guids are generated. The guid of their associated
+ * dRAID is stored in the config and used when opening the spare.
+ */
+ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
+ rvd->vdev_children)) == 0) {
+ if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
+ nspares = 0;
+ } else {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
/*
* We must validate the spares and l2cache devices after checking the
@@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* If we are in the middle of a device removal, we can only add
* devices which match the existing devices in the pool.
* If we are in the middle of a removal, or have some indirect
- * vdevs, we can not add raidz toplevels.
+ * vdevs, we can not add raidz or dRAID top levels.
*/
if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
@@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
- /* Fail if top level vdev is raidz */
- if (tvd->vdev_ops == &vdev_raidz_ops) {
+ /* Fail if top level vdev is raidz or a dRAID */
+ if (vdev_get_nparity(tvd) != 0)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
+
/*
* Need the top level mirror to be
* a mirror of leaf vdevs only
@@ -6506,6 +6545,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
/*
+ * We can't increment a feature while holding spa_vdev so we
+ * have to do it in a synctask.
+ */
+ if (ndraid != 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
+ (void *)(uintptr_t)ndraid, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
* We have to be careful when adding new vdevs to an existing pool.
* If other threads start allocating from these vdevs before we
* sync the config cache, and we lose power, then upon reboot we may
@@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ /*
+ * A dRAID spare can only replace a child of its parent dRAID vdev.
+ */
+ if (newvd->vdev_ops == &vdev_draid_spare_ops &&
+ oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
if (rebuild) {
/*
- * For rebuilds, the parent vdev must support reconstruction
+ * For rebuilds, the top vdev must support reconstruction
* using only space maps. This means the only allowable
- * parents are the root vdev or a mirror vdev.
+ * vdev types are the root vdev, a mirror, or dRAID.
*/
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops) {
+ tvd = pvd;
+ if (pvd->vdev_top != NULL)
+ tvd = pvd->vdev_top;
+
+ if (tvd->vdev_ops != &vdev_mirror_ops &&
+ tvd->vdev_ops != &vdev_root_ops &&
+ tvd->vdev_ops != &vdev_draid_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
@@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
}
/*
- * If we are detaching the original disk from a spare, then it implies
- * that the spare should become a real disk, and be removed from the
- * active spare list for the pool.
+ * If we are detaching the original disk from a normal spare, then it
+ * implies that the spare should become a real disk, and be removed
+ * from the active spare list for the pool. dRAID spares on the
+ * other hand are coupled to the pool and thus should never be removed
+ * from the spares list.
*/
- if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 &&
- pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
- unspare = B_TRUE;
+ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
+ vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ if (last_cvd->vdev_isspare &&
+ last_cvd->vdev_ops != &vdev_draid_spare_ops) {
+ unspare = B_TRUE;
+ }
+ }
/*
* Erase the disk labels so the disk can be used for other things.
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
/*
* If any devices are done replacing, detach them.
*/
- if (tasks & SPA_ASYNC_RESILVER_DONE)
+ if (tasks & SPA_ASYNC_RESILVER_DONE ||
+ tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
-
- /*
- * If any devices are done replacing, detach them. Then if no
- * top-level vdevs are rebuilding attempt to kick off a scrub.
- */
- if (tasks & SPA_ASYNC_REBUILD_DONE) {
- spa_vdev_resilver_done(spa);
-
- if (!vdev_rebuild_active(spa->spa_root_vdev))
- (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*