diff options
author | Brian Behlendorf <[email protected]> | 2020-11-13 13:51:51 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2020-11-13 13:51:51 -0800 |
commit | b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch) | |
tree | 6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /module/zfs/spa.c | |
parent | a724db03740133c46b9a577b41a6f7221acd3e1f (diff) |
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r-- | module/zfs/spa.c | 126 |
1 files changed, 94 insertions, 32 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9d1d4e0cc..ae8964e6f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -60,6 +60,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/mmp.h> @@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was @@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; @@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point @@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -6404,12 +6415,25 @@ spa_reset(const char *pool) */ /* + * This is called as a synctask to increment the draid feature flag + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + +/* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after vdev tree is created + * and the vdev guids are generated. The guid of their assoicated + * dRAID is stored in the config and used when opening the spare. + */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { + return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -6506,6 +6545,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. + */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may @@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + if (rebuild) { /* - * For rebuilds, the parent vdev must support reconstruction + * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable - * parents are the root vdev or a mirror vdev. + * vdevs types are the root vdev, a mirror, or dRAID. */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) { + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } @@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. @@ -8013,18 +8084,9 @@ spa_async_thread(void *arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); - - /* - * If any devices are done replacing, detach them. Then if no - * top-level vdevs are rebuilding attempt to kick off a scrub. - */ - if (tasks & SPA_ASYNC_REBUILD_DONE) { - spa_vdev_resilver_done(spa); - - if (!vdev_rebuild_active(spa->spa_root_vdev)) - (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* |