From b2255edcc0099e62ad46a3dd9d64537663c6aee3 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Fri, 13 Nov 2020 13:51:51 -0800
Subject: Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool, and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values.  This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68-disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    slag7                 ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```
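The dRAID vdev name above (`draid2:8d:68c:2s-0`) mirrors the
configuration string it was created with.  A pool with that dRAID
layout could be requested with a command along these lines; the device
names are illustrative and the special allocation class mirrors are
omitted for brevity:

    zpool create tank \
        draid2:8d:68c:2s \
        d1 d2 d3 ... d68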
When adding test coverage for the new dRAID vdev type, the following
options were added to the ztest command.  These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random  - kind of RAID to test
    -D <value>             - dRAID data drives per group
    -S <value>             - dRAID distributed hot spares
    -R <value>             - RAID parity (raidz or dRAID)
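For example, a hypothetical invocation exercising a double-parity
dRAID layout with 8 data drives per group and 2 distributed spares
might look like:

    ztest -K draid -R 2 -D 8 -S 2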
The zpool_create, zpool_import, redundancy, replacement and fault test
groups have all been updated to provide test coverage for the dRAID
feature.

Co-authored-by: Isaac Huang
Co-authored-by: Mark Maybee
Co-authored-by: Don Brady
Co-authored-by: Matthew Ahrens
Co-authored-by: Brian Behlendorf
Reviewed-by: Mark Maybee
Reviewed-by: Matt Ahrens
Reviewed-by: Tony Hutter
Signed-off-by: Brian Behlendorf
Closes #10102
---
 lib/libzfs/libzfs_dataset.c | 93 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 19 deletions(-)

(limited to 'lib/libzfs/libzfs_dataset.c')

diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 1eaed435c..47418b323 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
  * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
  * the 128k block example above.
  *
+ * The situation is slightly different for dRAID since the minimum allocation
+ * size is the full group width.  The same 8K block above would be written as
+ * follows in a dRAID group:
+ *
+ *       +-------+-------+-------+-------+-------+
+ *       | disk1 | disk2 | disk3 | disk4 | disk5 |
+ *       +-------+-------+-------+-------+-------+
+ *       |  P0   |  D0   |  D1   |  S0   |  S1   |
+ *       +-------+-------+-------+-------+-------+
+ *
  * Compression may lead to a variety of block sizes being written for the same
  * volume or file. There is no clear way to reserve just the amount of space
  * that will be required, so the worst case (no compression) is assumed.
@@ -5365,6 +5375,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
        return (asize);
 }
 
+/*
+ * Derived from function of same name in module/zfs/vdev_draid.c.  Returns the
+ * amount of space (in bytes) that will be allocated for the specified block
+ * size.
+ */
+static uint64_t
+vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+    uint64_t blksize)
+{
+       ASSERT3U(ndisks, >, nparity);
+       uint64_t ndata = ndisks - nparity;
+       uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1;
+       uint64_t asize = (rows * ndisks) << ashift;
+
+       return (asize);
+}
+
 /*
  * Determine how much space will be allocated if it lands on the most space-
  * inefficient top-level vdev. Returns the size in bytes required to store one
@@ -5374,7 +5401,7 @@ static uint64_t
 volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 {
        nvlist_t *config, *tree, **vdevs;
-       uint_t nvdevs, v;
+       uint_t nvdevs;
        uint64_t ret = 0;
 
        config = zpool_get_config(zhp, NULL);
@@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
                return (nblocks * blksize);
        }
 
-       for (v = 0; v < nvdevs; v++) {
+       for (int v = 0; v < nvdevs; v++) {
                char *type;
                uint64_t nparity, ashift, asize, tsize;
-               nvlist_t **disks;
-               uint_t ndisks;
                uint64_t volsize;
 
                if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
-                   &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
-                   nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
-                   &nparity) != 0 ||
-                   nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
-                   &ashift) != 0 ||
-                   nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
-                   &disks, &ndisks) != 0) {
+                   &type) != 0)
+                       continue;
+
+               if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
+                   strcmp(type, VDEV_TYPE_DRAID) != 0)
+                       continue;
+
+               if (nvlist_lookup_uint64(vdevs[v],
+                   ZPOOL_CONFIG_NPARITY, &nparity) != 0)
+                       continue;
+
+               if (nvlist_lookup_uint64(vdevs[v],
+                   ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
                        continue;
-               }
 
-               /* allocation size for the "typical" 128k block */
-               tsize = vdev_raidz_asize(ndisks, nparity, ashift,
-                   SPA_OLD_MAXBLOCKSIZE);
-               /* allocation size for the blksize block */
-               asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
+               if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+                       nvlist_t **disks;
+                       uint_t ndisks;
+
+                       if (nvlist_lookup_nvlist_array(vdevs[v],
+                           ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0)
+                               continue;
+
+                       /* allocation size for the "typical" 128k block */
+                       tsize = vdev_raidz_asize(ndisks, nparity, ashift,
+                           SPA_OLD_MAXBLOCKSIZE);
+
+                       /* allocation size for the blksize block */
+                       asize = vdev_raidz_asize(ndisks, nparity, ashift,
+                           blksize);
+               } else {
+                       uint64_t ndata;
+
+                       if (nvlist_lookup_uint64(vdevs[v],
+                           ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0)
+                               continue;
+
+                       /* allocation size for the "typical" 128k block */
+                       tsize = vdev_draid_asize(ndata + nparity, nparity,
+                           ashift, SPA_OLD_MAXBLOCKSIZE);
+
+                       /* allocation size for the blksize block */
+                       asize = vdev_draid_asize(ndata + nparity, nparity,
+                           ashift, blksize);
+               }
 
                /*
-                * Scale this size down as a ratio of 128k / tsize. See theory
-                * statement above.
+                * Scale this size down as a ratio of 128k / tsize.
+                * See theory statement above.
                 */
                volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
                if (volsize > ret) {
-- 
cgit v1.2.3