author     Brian Behlendorf <[email protected]>    2020-11-13 13:51:51 -0800
committer  GitHub <[email protected]>              2020-11-13 13:51:51 -0800
commit     b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch)
tree       6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /module
parent     a724db03740133c46b9a577b41a6f7221acd3e1f (diff)
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
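For example, a minimal sketch of creating a double-parity dRAID pool
with the default layout (the pool name and device paths below are
placeholders, not taken from this patch):
```
# Create a draid2 pool from 12 disks; data devices per group,
# children, and spares all fall back to their defaults.
zpool create tank draid2 /dev/sda /dev/sdb /dev/sdc /dev/sdd \
    /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj \
    /dev/sdk /dev/sdl
```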
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon-separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[<parity>] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
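Putting these together, a fully specified layout can be requested on
the command line. The example below is illustrative and uses the same
geometry as the status output that follows; `<disk1> ... <disk68>`
stand in for the 68 physical devices:
```
# Double parity, 8 data devices per group, 68 children expected,
# and 2 distributed hot spares.
zpool create tank draid2:8d:68c:2s <disk1> <disk2> ... <disk68>
```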
Abbreviated example `zpool status` output for a 68-disk dRAID pool
with two distributed spares using special allocation classes.
```
  pool: tank
 state: ONLINE
config:

        NAME                  STATE     READ WRITE CKSUM
        slag7                 ONLINE       0     0     0
          draid2:8d:68c:2s-0  ONLINE       0     0     0
            L0                ONLINE       0     0     0
            L1                ONLINE       0     0     0
            ...
            U25               ONLINE       0     0     0
            U26               ONLINE       0     0     0
            spare-53          ONLINE       0     0     0
              U27             ONLINE       0     0     0
              draid2-0-0      ONLINE       0     0     0
            U28               ONLINE       0     0     0
            U29               ONLINE       0     0     0
            ...
            U42               ONLINE       0     0     0
            U43               ONLINE       0     0     0
        special
          mirror-1            ONLINE       0     0     0
            L5                ONLINE       0     0     0
            U5                ONLINE       0     0     0
          mirror-2            ONLINE       0     0     0
            L6                ONLINE       0     0     0
            U6                ONLINE       0     0     0
        spares
          draid2-0-0          INUSE     currently in use
          draid2-0-1          AVAIL
```
When adding test coverage for the new dRAID vdev type, the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
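For instance, the new options can be combined into a single ztest
invocation such as the sketch below; only the dRAID-specific flags are
shown, and zloop.sh normally supplies the remaining parameters (run
time, vdev counts, and so on):
```
# Test a draid2 layout with 8 data drives per group and
# 2 distributed hot spares.
ztest -K draid -D 8 -S 2 -R 2
```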
The zpool_create, zpool_import, redundancy, replacement, and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
Diffstat (limited to 'module')
33 files changed, 5357 insertions, 1388 deletions
diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 4a2514fd4..1acf543ac 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -243,6 +243,8 @@ SRCS+= abd.c \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ @@ -341,6 +343,7 @@ CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_draid.c= -Wno-cast-qual CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index cf762c5fd..825bd706e 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -292,19 +292,28 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* @@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = { #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b888cfdf0..ae7cbe60a 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1189,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - 
VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_geom_open, + .vdev_op_close = vdev_geom_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_geom_io_start, + .vdev_op_io_done = vdev_geom_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_geom_hold, + .vdev_op_rele = vdev_geom_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 7de5c30f7..12117655b 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -826,9 +826,13 @@ vdev_disk_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, @@ -837,6 +841,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 423ce8581..bf8a13ae6 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; @@ -341,9 +350,13 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + 
.vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 97ddacbab..599791d49 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -576,7 +576,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", - "Support for sequential device rebuilds", + "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { @@ -589,6 +589,10 @@ zpool_feature_init(void) "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); } + + zfeature_register(SPA_FEATURE_DRAID, + "org.openzfs:draid", "draid", "Support for distributed parity RAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index f8625042a..0011a971c 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 8ee524fff..653ea0da9 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o +$(MODULE)-objs += vdev_draid.o +$(MODULE)-objs += vdev_draid_rand.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 6018a42ca..68d4aa5f5 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -781,16 +781,17 @@ int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { - int ret = 0; struct abd_iter aiter; - boolean_t abd_multi; - abd_t *c_abd; + int ret = 0; + + if (size == 0) + return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_multi = abd_is_gang(abd); - c_abd = abd_init_abd_iter(abd, &aiter, off); + boolean_t abd_multi = abd_is_gang(abd); + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ @@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; + if (size == 0) + return (0); + abd_verify(dabd); abd_verify(sabd); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f6a5ceca6..40adfbcee 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) return (0); } -static void +void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; @@ -3328,19 +3328,12 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, } /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - - /* * Check if the top-level vdev must resilver this offset. 
* When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 325f505b7..fcf1285f6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -32,6 +32,7 @@ #include <sys/space_map.h> #include <sys/metaslab_impl.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/zio.h> #include <sys/spa_impl.h> #include <sys/zfeature.h> @@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) + /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking @@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); + if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); @@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocator = -1; ms->ms_new = B_TRUE; + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_metaslab_init != NULL) + ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); + /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. @@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } - } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 99852521b..d05c9db24 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa) if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); - if (!vdev_writeable(leaf)) { + /* + * We skip unwritable, offline, detached, and dRAID spare + * devices as they are either not legal targets or the write + * may fail or not be seen by other hosts. Skipped dRAID + * spares can never be written so the fail mask is not set. 
+ */ + if (!vdev_writeable(leaf) || leaf->vdev_offline || + leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { + continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9d1d4e0cc..ae8964e6f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -60,6 +60,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/mmp.h> @@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was @@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; @@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point @@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -6404,12 +6415,25 @@ spa_reset(const char *pool) */ /* + * This is called as a synctask to increment the draid feature flag + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + +/* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after vdev tree is created + * and the vdev guids are generated. The guid of their assoicated + * dRAID is stored in the config and used when opening the spare. 
+ */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { + return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -6506,6 +6545,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. + */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may @@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + if (rebuild) { /* - * For rebuilds, the parent vdev must support reconstruction + * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable - * parents are the root vdev or a mirror vdev. + * vdevs types are the root vdev, a mirror, or dRAID. */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) { + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } @@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. 
*/ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. @@ -8013,18 +8084,9 @@ spa_async_thread(void *arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); - - /* - * If any devices are done replacing, detach them. Then if no - * top-level vdevs are rebuilding attempt to kick off a scrub. - */ - if (tasks & SPA_ASYNC_REBUILD_DONE) { - spa_vdev_resilver_done(spa); - - if (!vdev_rebuild_active(spa->spa_root_vdev)) - (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 1640dcedd..c6b3e8c11 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e41e79ab8..38f36e52f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -40,6 +40,7 @@ #include <sys/dsl_dir.h> #include <sys/vdev_impl.h> #include <sys/vdev_rebuild.h> +#include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -51,6 +52,7 @@ #include <sys/arc.h> #include <sys/zil.h> #include <sys/dsl_scan.h> +#include <sys/vdev_raidz.h> #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> @@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -221,10 +225,11 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; } /* @@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) return (asize); } +uint64_t +vdev_default_min_asize(vdev_t *vd) +{ + return (vd->vdev_min_asize); +} + /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to @@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd) if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. 
- */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); + return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void @@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd) vdev_set_min_asize(vd->vdev_child[c]); } +/* + * Get the minimal allocation size for the top-level vdev. + */ +uint64_t +vdev_get_min_alloc(vdev_t *vd) +{ + uint64_t min_alloc = 1ULL << vd->vdev_ashift; + + if (vd->vdev_ops->vdev_op_min_alloc != NULL) + min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); + + return (min_alloc); +} + +/* + * Get the parity level for a top-level vdev. + */ +uint64_t +vdev_get_nparity(vdev_t *vd) +{ + uint64_t nparity = 0; + + if (vd->vdev_ops->vdev_op_nparity != NULL) + nparity = vd->vdev_ops->vdev_op_nparity(vd); + + return (nparity); +} + +/* + * Get the number of data disks for a top-level vdev. + */ +uint64_t +vdev_get_ndisks(vdev_t *vd) +{ + uint64_t ndisks = 1; + + if (vd->vdev_ops->vdev_op_ndisks != NULL) + ndisks = vd->vdev_ops->vdev_op_ndisks(vd); + + return (ndisks); +} + vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { @@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); @@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, @@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; @@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. 
- */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; + /* + * If creating a top-level vdev, check for allocation + * classes input. + */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); @@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); } } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_draid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + return (SET_ERROR(ENOTSUP)); + } } - vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; + /* + * Initialize the vdev specific data. This is done before calling + * vdev_alloc_common() since it may fail and this simplifies the + * error reporting and cleanup code paths. + */ + void *tsd = NULL; + if (ops->vdev_op_init != NULL) { + rc = ops->vdev_op_init(spa, nv, &tsd); + if (rc != 0) { + return (rc); + } + } + vd = vdev_alloc_common(spa, id, guid, ops); + vd->vdev_tsd = tsd; vd->vdev_islog = islog; - vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + vic = &vd->vdev_indirect_config; + ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); @@ -937,6 +967,9 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops->vdev_op_fini != NULL) + vd->vdev_ops->vdev_op_fini(vd); + /* * Discard allocation state. */ @@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); - mutex_destroy(&vd->vdev_rebuild_io_lock); cv_destroy(&vd->vdev_rebuild_cv); - cv_destroy(&vd->vdev_rebuild_io_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd) } /* - * Add a mirror/replacing vdev above an existing vdev. + * Add a mirror/replacing vdev above an existing vdev. There is no need to + * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) @@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; + + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } } } @@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd) return (B_FALSE); } -void -vdev_open_children(vdev_t *vd) +/* + * Returns B_TRUE if the passed child should be opened. + */ +static boolean_t +vdev_default_open_children_func(vdev_t *vd) +{ + return (B_TRUE); +} + +/* + * Open the requested child vdevs. If any of the leaf vdevs are using + * a ZFS volume then do the opens in a single thread. This avoids a + * deadlock when the current thread is holding the spa_namespace_lock. 
+ */ +static void +vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { - taskq_t *tq; int children = vd->vdev_children; - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (vdev_uses_zvols(vd)) { -retry_sync: - for (int c = 0; c < children; c++) - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - } else { - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - if (tq == NULL) - goto retry_sync; + taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + vd->vdev_nonrot = B_TRUE; - for (int c = 0; c < children; c++) + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE) + continue; + + if (tq == NULL || vdev_uses_zvols(vd)) { + cvd->vdev_open_error = vdev_open(cvd); + } else { VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); + cvd, TQ_SLEEP) != TASKQID_INVALID); + } + vd->vdev_nonrot &= cvd->vdev_nonrot; + } + + if (tq != NULL) { + taskq_wait(tq); taskq_destroy(tq); } +} - vd->vdev_nonrot = B_TRUE; +/* + * Open all child vdevs. + */ +void +vdev_open_children(vdev_t *vd) +{ + vdev_open_children_impl(vd, vdev_default_open_children_func); +} - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +/* + * Conditionally open a subset of child vdevs. + */ +void +vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + vdev_open_children_impl(vd, open_func); } /* @@ -1953,6 +2017,16 @@ vdev_open(vdev_t *vd) } /* + * Track the the minimum allocation size. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + } + + /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. @@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd != NULL); + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are @@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) } /* - * Returns B_TRUE if vdev determines offset needs to be resilvered. + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ +boolean_t +vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + /* Set by sequential resilver. */ + if (phys_birth == TXG_UNKNOWN) + return (B_TRUE); + + return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); +} + +/* + * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. 
*/ boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); @@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) vd->vdev_ops->vdev_op_leaf) return (B_TRUE); - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); + return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, + phys_birth)); } /* @@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ + else if (vdev_get_nparity(vd) != 0) + minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); @@ -3727,6 +3820,9 @@ top: if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio) static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { + /* + * Exclude the dRAID spare when aggregating to avoid double counting + * the ops and bytes. These IOs are counted by the physical leaves. + */ + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return; + for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; @@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); - } } else { /* @@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * Repair is the result of a rebuild issued by the - * rebuild thread (vdev_rebuild_thread). + * rebuild thread (vdev_rebuild_thread). To avoid + * double counting repaired bytes the virtual dRAID + * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; @@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); + } vs->vs_rebuild_processed += psize; } @@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) vdev_resilver_needed(vd, NULL, NULL)); } +boolean_t +vdev_xlate_is_empty(range_seg64_t *rs) +{ + return (rs->rs_start == rs->rs_end); +} + /* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. + * Translate a logical range to the first contiguous physical range for the + * specified vdev_t. This function is initially called with a leaf vdev and + * will walk each parent vdev until it reaches a top-level vdev. 
Once the + * top-level is reached the physical range is initialized and the recursive + * function begins to unwind. As it unwinds it calls the parent's vdev + * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs) + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, + remain_rs); } else { /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. + * We've reached the top-level vdev, initialize the physical + * range to the logical range and set an empty remaining + * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; + + remain_rs->rs_start = logical_rs->rs_start; + remain_rs->rs_end = logical_rs->rs_start; + return; } @@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, /* * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. + * range into its physical and any remaining components by calling + * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } +void +vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg) +{ + range_seg64_t iter_rs = *logical_rs; + range_seg64_t physical_rs; + range_seg64_t remain_rs; + + while (!vdev_xlate_is_empty(&iter_rs)) { + + vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); + + /* + * With raidz and dRAID, it's possible that the logical range + * does not live on this leaf vdev. Only when there is a non- + * zero physical size call the provided function. + */ + if (!vdev_xlate_is_empty(&physical_rs)) + func(arg, &physical_rs); + + iter_rs = remain_rs; + } +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 000000000..6b7ad7021 --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,2984 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_rebuild.h> +#include <sys/abd.h> +#include <sys/zio.h> +#include <sys/nvpair.h> +#include <sys/zio_checksum.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> +#include <zfs_fletcher.h> + +#ifdef ZFS_DEBUG +#include <sys/vdev.h> /* For vdev_xlate() in vdev_draid_io_verify() */ +#endif + +/* + * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is + * comprised of multiple raidz redundancy groups which are spread over the + * dRAID children. To ensure an even distribution, and avoid hot spots, a + * permutation mapping is applied to the order of the dRAID children. + * This mixing effectively distributes the parity columns evenly over all + * of the disks in the dRAID. + * + * This is beneficial because it means when resilvering all of the disks + * can participate thereby increasing the available IOPs and bandwidth. + * Furthermore, by reserving a small fraction of each child's total capacity + * virtual distributed spare disks can be created. These spares similarly + * benefit from the performance gains of spanning all of the children. The + * consequence of which is that resilvering to a distributed spare can + * substantially reduce the time required to restore full parity to pool + * with a failed disks. + * + * === dRAID group layout === + * + * First, let's define a "row" in the configuration to be a 16M chunk from + * each physical drive at the same offset. This is the minimum allowable + * size since it must be possible to store a full 16M block when there is + * only a single data column. Next, we define a "group" to be a set of + * sequential disks containing both the parity and data columns. We allow + * groups to span multiple rows in order to align any group size to any + * number of physical drives. Finally, a "slice" is comprised of the rows + * which contain the target number of groups. The permutation mappings + * are applied in a round robin fashion to each slice. + * + * Given D+P drives in a group (including parity drives) and C-S physical + * drives (not including the spare drives), we can distribute the groups + * across R rows without remainder by selecting the least common multiple + * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S). + * + * In the example below, there are C=14 physical drives in the configuration + * with S=2 drives worth of spare capacity. Each group has a width of 9 + * which includes D=8 data and P=1 parity drive. There are 4 groups and + * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice + * size is 576M (144M * 4). When allocating from a dRAID each group is + * filled before moving on to the next as show in slice0 below. + * + * data disks (8 data + 1 parity) spares (2) + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 + * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | | group 0 | group 1..| | + * | +-----------------------------------+-----------+-------| + * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r + * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o + * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w + * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 + * s +-----------------------+-----------------------+-------+ + * l | ..group 1 | group 2.. 
| | + * i +-----------------------+-----------------------+-------+ + * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r + * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o + * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w + * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 + * | +-----------+-----------+-----------------------+-------+ + * | |..group 2 | group 3 | | + * | +-----------+-----------+-----------------------+-------+ + * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r + * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o + * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w + * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 4 | group 5..| | row 3 + * i +-----------------------+-----------+-----------+-------| + * c | ..group 5 | group 6.. | | row 4 + * e +-----------+-----------+-----------------------+-------+ + * 1 |..group 6 | group 7 | | row 5 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 8 | group 9..| | row 6 + * i +-----------------------------------------------+-------| + * c | ..group 9 | group 10.. | | row 7 + * e +-----------------------+-----------------------+-------+ + * 2 |..group 10 | group 11 | | row 8 + * +-----------+-----------------------------------+-------+ + * + * This layout has several advantages over requiring that each row contain + * a whole number of groups. + * + * 1. The group count is not a relevant parameter when defining a dRAID + * layout. Only the group width is needed, and *all* groups will have + * the desired size. + * + * 2. All possible group widths (<= physical disk count) can be supported. + * + * 3. The logic within vdev_draid.c is simplified when the group width is + * the same for all groups (although some of the logic around computing + * permutation numbers and drive offsets is more complicated). + * + * N.B. The following array describes all valid dRAID permutation maps. + * Each row is used to generate a permutation map for a different number + * of children from a unique seed. The seeds were generated and carefully + * evaluated by the 'draid' utility in order to provide balanced mappings. + * In addition to the seed a checksum of the in-memory mapping is stored + * for verification. + * + * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, + * with a given permutation map) is the ratio of the amounts of I/O that will + * be sent to the least and most busy disks when resilvering. The average + * imbalance ratio (of a given number of disks and permutation map) is the + * average of the ratios of all possible single and double disk failures. + * + * In order to achieve a low imbalance ratio the number of permutations in + * the mapping must be significantly larger than the number of children. + * For dRAID the number of permutations has been limited to 512 to minimize + * the map size. This does result in a gradually increasing imbalance ratio + * as seen in the table below. Increasing the number of permutations for + * larger child counts would reduce the imbalance ratio. 
However, in practice + * when there are a large number of children each child is responsible for + * fewer total IOs so it's less of a concern. + * + * Note these values are hard coded and must never be changed. Existing + * pools depend on the same mapping always being generated in order to + * read and write from the correct locations. Any change would make + * existing pools completely inaccessible. + */ +static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { + { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ + { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ + { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ + { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ + { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ + { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ + { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ + { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ + { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ + { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ + { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ + { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ + { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ + { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ + { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ + { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ + { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ + { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ + { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ + { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ + { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ + { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ + { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ + { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ + { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ + { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ + { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ + { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ + { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ + { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ + { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ + { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ + { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ + { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ + { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ + { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ + { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ + { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ + { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ + { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ + { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ + { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ + { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ + { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ + { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ + { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ + { 48, 
512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ + { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ + { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ + { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ + { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ + { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ + { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ + { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ + { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ + { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ + { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ + { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ + { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ + { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ + { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ + { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ + { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ + { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ + { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ + { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ + { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ + { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ + { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ + { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ + { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ + { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ + { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ + { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ + { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ + { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */ + { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ + { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ + { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ + { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ + { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ + { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ + { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ + { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ + { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ + { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ + { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ + { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ + { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ + { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ + { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ + { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ + { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ + { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ + { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ + { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ + { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ + { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ + { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ + { 101, 
512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ + { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ + { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ + { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ + { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ + { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ + { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */ + { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ + { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ + { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ + { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ + { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ + { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ + { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ + { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ + { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ + { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ + { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ + { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ + { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ + { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ + { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ + { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ + { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ + { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ + { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ + { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ + { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ + { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ + { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ + { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ + { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ + { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ + { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ + { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ + { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */ + { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ + { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ + { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ + { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ + { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ + { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ + { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ + { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ + { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ + { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ + { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ + { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ + { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ + { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ + { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ + { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ + { 153, 512, 
0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ + { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ + { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ + { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ + { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ + { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ + { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ + { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ + { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ + { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ + { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ + { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ + { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ + { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ + { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ + { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ + { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ + { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ + { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ + { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ + { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ + { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ + { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ + { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ + { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ + { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ + { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ + { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ + { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ + { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ + { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ + { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ + { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ + { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ + { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ + { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ + { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ + { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ + { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ + { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ + { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ + { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ + { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */ + { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ + { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ + { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ + { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ + { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ + { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ + { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ + { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ + { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ + { 205, 512, 
0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ + { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ + { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ + { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ + { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ + { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ + { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ + { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ + { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ + { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ + { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ + { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ + { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ + { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ + { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ + { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ + { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ + { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ + { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ + { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */ + { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ + { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ + { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ + { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ + { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ + { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ + { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ + { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ + { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ + { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ + { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ + { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ + { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ + { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ + { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ + { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ + { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ + { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ + { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ + { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ + { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ + { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ + { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ + { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ + { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ + { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ + { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ + { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ + { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ + { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ + { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ +}; + +/* + * Verify the map is valid. 
Each device index must appear exactly + * once in every row, and the permutation array checksum must match. + */ +static int +verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, + uint64_t checksum) +{ + int countssz = sizeof (uint16_t) * children; + uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); + + for (int i = 0; i < nperms; i++) { + for (int j = 0; j < children; j++) { + uint8_t val = perms[(i * children) + j]; + + if (val >= children || counts[val] != i) { + kmem_free(counts, countssz); + return (EINVAL); + } + + counts[val]++; + } + } + + if (checksum != 0) { + int permssz = sizeof (uint8_t) * children * nperms; + zio_cksum_t cksum; + + fletcher_4_native_varsize(perms, permssz, &cksum); + + if (checksum != cksum.zc_word[0]) { + kmem_free(counts, countssz); + return (ECKSUM); + } + } + + kmem_free(counts, countssz); + + return (0); +} + +/* + * Generate the permutation array for the draid_map_t. These maps control + * the placement of all data in a dRAID. Therefore it's critical that the + * seed always generates the same mapping. We provide our own pseudo-random + * number generator for this purpose. + */ +int +vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) +{ + VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); + VERIFY3U(map->dm_seed, !=, 0); + VERIFY3U(map->dm_nperms, !=, 0); + VERIFY3P(map->dm_perms, ==, NULL); + +#ifdef _KERNEL + /* + * The kernel code always provides both a map_seed and checksum. + * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide + * a zero checksum when generating new candidate maps. + */ + VERIFY3U(map->dm_checksum, !=, 0); +#endif + uint64_t children = map->dm_children; + uint64_t nperms = map->dm_nperms; + int rowsz = sizeof (uint8_t) * children; + int permssz = rowsz * nperms; + uint8_t *perms; + + /* Allocate the permutation array */ + perms = vmem_alloc(permssz, KM_SLEEP); + + /* Setup an initial row with a known pattern */ + uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); + for (int i = 0; i < children; i++) + initial_row[i] = i; + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + uint8_t *current_row, *previous_row = initial_row; + + /* + * Perform a Fisher-Yates shuffle of each row using the previous + * row as the starting point. An initial_row with known pattern + * is used as the input for the first row. + */ + for (int i = 0; i < nperms; i++) { + current_row = &perms[i * children]; + memcpy(current_row, previous_row, rowsz); + + for (int j = children - 1; j > 0; j--) { + uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); + uint8_t val = current_row[j]; + current_row[j] = current_row[k]; + current_row[k] = val; + } + + previous_row = current_row; + } + + kmem_free(initial_row, rowsz); + + int error = verify_perms(perms, children, nperms, map->dm_checksum); + if (error) { + vmem_free(perms, permssz); + return (error); + } + + *permsp = perms; + + return (0); +} + +/* + * Lookup the fixed draid_map_t for the requested number of children. + */ +int +vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) +{ + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { + if (draid_maps[i].dm_children == children) { + *mapp = &draid_maps[i]; + return (0); + } + } + + return (ENOENT); +} + +/* + * Lookup the permutation array and iteration id for the provided offset. 
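The shuffle above is easier to see outside the kernel. The following stand-alone sketch reproduces the idea — seed a small PRNG with two fixed 64-bit words, then derive each row by Fisher-Yates shuffling the previous one — so the same seed always yields the same rows. It is illustrative only: the `prng_next()` xorshift128+ generator, the seed values, and the 11-child/4-row sizes are invented here and are not the `vdev_draid_rand()` generator or the constants used by the module.

```
/*
 * Illustrative userland sketch of seeded, reproducible row shuffling.
 * Not the kernel code: the PRNG and constants below are stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define	CHILDREN	11	/* hypothetical child count */
#define	NROWS		4	/* hypothetical rows per map */

/* Stand-in PRNG (xorshift128+); the module uses vdev_draid_rand(). */
static uint64_t
prng_next(uint64_t s[2])
{
	uint64_t x = s[0], y = s[1];

	s[0] = y;
	x ^= x << 23;
	s[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
	return (s[1] + y);
}

int
main(void)
{
	uint8_t rows[NROWS][CHILDREN];
	uint64_t seed[2] = { 0xd7a1dc0ffee12345ULL, 42 };	/* fixed seed */

	/* The first row starts from the identity permutation. */
	for (int j = 0; j < CHILDREN; j++)
		rows[0][j] = j;

	for (int i = 0; i < NROWS; i++) {
		if (i > 0)
			memcpy(rows[i], rows[i - 1], CHILDREN);

		/* Fisher-Yates shuffle driven only by the seeded PRNG. */
		for (int j = CHILDREN - 1; j > 0; j--) {
			uint64_t k = prng_next(seed) % (j + 1);
			uint8_t t = rows[i][j];

			rows[i][j] = rows[i][k];
			rows[i][k] = t;
		}

		for (int j = 0; j < CHILDREN; j++)
			printf("%2d ", rows[i][j]);
		printf("\n");
	}
	return (0);
}
```

Running it twice prints identical rows, which is exactly the reproducibility property the on-disk layout depends on.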
+ */ +static void +vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, + uint8_t **base, uint64_t *iter) +{ + uint64_t ncols = vdc->vdc_children; + uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + + *base = vdc->vdc_perms + (poff / ncols) * ncols; + *iter = poff % ncols; +} + +static inline uint64_t +vdev_draid_permute_id(vdev_draid_config_t *vdc, + uint8_t *base, uint64_t iter, uint64_t index) +{ + return ((base[index] + iter) % vdc->vdc_children); +} + +/* + * Return the asize which is the psize rounded up to a full group width. + * i.e. vdev_draid_psize_to_asize(). + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_ashift; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; + uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; + + ASSERT3U(asize, !=, 0); + ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + + return (asize); +} + +/* + * Deflate the asize to the psize, this includes stripping parity. + */ +uint64_t +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT0(asize % vdc->vdc_groupwidth); + + return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); +} + +/* + * Convert a logical offset to the corresponding group number. + */ +static uint64_t +vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (offset / vdc->vdc_groupsz); +} + +/* + * Convert a group number to the logical starting offset for that group. + */ +static uint64_t +vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (group * vdc->vdc_groupsz); +} + + +static void +vdev_draid_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT0(rm->rm_freed); + rm->rm_freed = B_TRUE; + + if (rm->rm_reports == 0) { + vdev_raidz_map_free(rm); + } +} + +/*ARGSUSED*/ +static void +vdev_draid_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed) + vdev_raidz_map_free(rm); +} + +static void +vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + const size_t c = zcr->zcr_cbinfo; + uint64_t skip_size = zcr->zcr_sector; + uint64_t parity_size; + size_t x, offset, size; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + /* + * Detailed cksum reporting is currently only supported for single + * row draid mappings, this covers the vast majority of zios. Only + * a dRAID zio which spans groups will have multiple rows. + */ + if (rm->rm_nrows != 1) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + raidz_row_t *rr = rm->rm_row[0]; + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical zio) + */ + if (rr->rr_col[0].rc_gdata == NULL) { + abd_t *bad_parity[VDEV_DRAID_MAXPARITY]; + + /* + * Set up the rr_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. 
+ */ + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); + } + + /* + * Fill in the data columns from good_data being + * careful to pad short columns and empty columns + * with a skip sector. + */ + uint64_t good_size = abd_get_size((abd_t *)good_data); + + offset = 0; + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + + if (offset == good_size) { + /* empty data column (small write) */ + rr->rr_col[x].rc_abd = + abd_get_zeros(skip_size); + } else if (x < rr->rr_bigcols) { + /* this is a "big column" */ + size = rr->rr_col[x].rc_size; + rr->rr_col[x].rc_abd = + abd_get_offset_size( + (abd_t *)good_data, offset, size); + offset += size; + } else { + /* short data column, add skip sector */ + size = rr->rr_col[x].rc_size -skip_size; + rr->rr_col[x].rc_abd = abd_alloc( + rr->rr_col[x].rc_size, B_TRUE); + abd_copy_off(rr->rr_col[x].rc_abd, + (abd_t *)good_data, 0, offset, + size); + abd_zero_off(rr->rr_col[x].rc_abd, + size, skip_size); + offset += size; + } + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity_row(rm, rr); + + /* restore everything back to its original state */ + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; + + offset = 0; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + if (offset == good_size || x < rr->rr_bigcols) + abd_put(rr->rr_col[x].rc_abd); + else + abd_free(rr->rr_col[x].rc_abd); + + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; + } + } + + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); + } else { + /* adjust good_data to point at the start of our column */ + parity_size = size = rr->rr_col[0].rc_size; + if (c >= rr->rr_bigcols) { + size -= skip_size; + zcr->zcr_length = size; + } + + /* empty column */ + if (size == 0) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE); + return; + } + + offset = 0; + for (x = rr->rr_firstdatacol; x < c; x++) { + if (x < rr->rr_bigcols) { + offset += parity_size; + } else { + offset += parity_size - skip_size; + } + } + + good = abd_get_offset_size((abd_t *)good_data, offset, size); + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_put((abd_t *)good); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_draid_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + raidz_map_t *rm = zio->io_vsd; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_draid_cksum_finish; + zcr->zcr_free = vdev_draid_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_row[0]->rr_abd_copy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. 
+ * + * Our parity data is already in separate buffers, so there's no need + * to copy them. Furthermore, all columns should have been expanded + * by vdev_draid_map_alloc_empty() when attempting reconstruction. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + ASSERT3U(rr->rr_col[c].rc_size, ==, + rr->rr_col[0].rc_size); + size += rr->rr_col[c].rc_size; + } + + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); + + abd_copy(tmp, col->rc_abd, col->rc_size); + + if (abd_is_gang(col->rc_abd)) + abd_free(col->rc_abd); + else + abd_put(col->rc_abd); + + col->rc_abd = tmp; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); + } +} + +const zio_vsd_ops_t vdev_draid_vsd_ops = { + .vsd_free = vdev_draid_map_free_vsd, + .vsd_cksum_report = vdev_draid_cksum_report +}; + +/* + * Full stripe writes. When writing, all columns (D+P) are required. Parity + * is calculated over all the columns, including empty zero filled sectors, + * and each is written to disk. While only the data columns are needed for + * a normal read, all of the columns are required for reconstruction when + * performing a sequential resilver. + * + * For "big columns" it's sufficient to map the correct range of the zio ABD. + * Partial columns require allocating a gang ABD in order to zero fill the + * empty sectors. When the column is empty a zero filled sector must be + * mapped. In all cases the data ABDs must be the same size as the parity + * ABDs (e.g. rc->rc_size == parity_size). + */ +static void +vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small write), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + rc->rc_abd = abd_get_zeros(skip_size); + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), + B_TRUE); + } + + ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); + + abd_off += rc->rc_size; + rc->rc_size = parity_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Scrub/resilver reads. In order to store the contents of the skip sectors + * an additional ABD is allocated. The columns are handled in the same way + * as a full stripe write except instead of using the zero ABD the newly + * allocated skip ABD is used to back the skip sectors. In all cases the + * data ABD must be the same size as the parity ABDs. 
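As a rough illustration of the padding rule described above — every data column must end up the same size as the parity columns, with zero-filled skip sectors making up any shortfall — here is a minimal userland sketch. It uses plain heap buffers and an assumed 4 KiB sector in place of ABDs and gang ABDs; only the size bookkeeping reflects the idea, none of this is the module's code.

```
/* Illustrative only: pad each data column out to the parity column size. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	SECTOR	4096	/* assumed 4 KiB sector (ashift = 12) */

int
main(void)
{
	/* Hypothetical row: 2-sector parity columns, varying data columns. */
	size_t parity_size = 2 * SECTOR;
	size_t data_size[3] = { 2 * SECTOR, 1 * SECTOR, 0 };
	const char *kind[3] = { "big", "short", "empty" };

	for (int c = 0; c < 3; c++) {
		/* calloc() zero-fills, standing in for the skip sector. */
		unsigned char *col = calloc(1, parity_size);

		if (col == NULL)
			return (1);

		memset(col, 0xab, data_size[c]);	/* "copy in" the data */
		printf("%-5s column: %zu bytes data + %zu bytes skip\n",
		    kind[c], data_size[c], parity_size - data_size[c]);
		free(col);
	}
	return (0);
}
```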
+ */ +static void +vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + uint64_t abd_size = abd_get_size(rc->rc_abd); + ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + /* + * Increase rc_size so the skip ABD is included in subsequent + * parity calculations. + */ + abd_off += rc->rc_size; + rc->rc_size = abd_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Normal reads. In this common case only the columns containing data + * are read in to the zio ABDs. Neither the parity columns or empty skip + * sectors are read unless the checksum fails verification. In which case + * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand + * the raid map in order to allow reconstruction using the parity data and + * skip sectors. + */ +static void +vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size > 0) { + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + abd_off += rc->rc_size; + } + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key + * difference is that an ABD is allocated to back skip sectors so they may + * be read in to memory, verified, and repaired if needed. 
+ */ +void +vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, ==, NULL); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column", nothing to add */ + ASSERT3P(rc->rc_abd, !=, NULL); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT(!abd_is_gang(rc->rc_abd)); + abd_t *read_abd = rc->rc_abd; + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, read_abd, B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + /* + * Increase rc_size so the empty ABD is included in subsequent + * parity calculations. + */ + rc->rc_size = parity_size; + } + + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Given a logical address within a dRAID configuration, return the physical + * address on the first drive in the group that this address maps to + * (at position 'start' in permutation number 'perm'). + */ +static uint64_t +vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, + uint64_t *perm, uint64_t *start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + /* b is the dRAID (parent) sector offset. */ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t b_offset = logical_offset >> ashift; + + /* + * The height of a row in units of the vdev's minimum sector size. + * This is the amount of data written to each disk of each group + * in a given permutation. + */ + uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift; + + /* + * We cycle through a disk permutation every groupsz * ngroups chunk + * of address space. Note that ngroups * groupsz must be a multiple + * of the number of data drives (ndisks) in order to guarantee + * alignment. So, for example, if our row height is 16MB, our group + * size is 10, and there are 13 data drives in the draid, then ngroups + * will be 13, we will change permutation every 2.08GB and each + * disk will have 160MB of data per chunk. + */ + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t ngroups = vdc->vdc_ngroups; + uint64_t ndisks = vdc->vdc_ndisks; + + /* + * groupstart is where the group this IO will land in "starts" in + * the permutation array. 
+ */ + uint64_t group = logical_offset / vdc->vdc_groupsz; + uint64_t groupstart = (group * groupwidth) % ndisks; + ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart); + *start = groupstart; + + /* b_offset is the sector offset within a group chunk */ + b_offset = b_offset % (rowheight_sectors * groupwidth); + ASSERT0(b_offset % groupwidth); + + /* + * Find the starting byte offset on each child vdev: + * - within a permutation there are ngroups groups spread over the + * rows, where each row covers a slice portion of the disk + * - each permutation has (groupwidth * ngroups) / ndisks rows + * - so each permutation covers rows * slice portion of the disk + * - so we need to find the row where this IO group target begins + */ + *perm = group / ngroups; + uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + (((group % ngroups) * groupwidth) / ndisks); + + return (((rowheight_sectors * row) + + (b_offset / groupwidth)) << ashift); +} + +static uint64_t +vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, + uint64_t abd_offset, uint64_t abd_size) +{ + vdev_t *vd = zio->io_vd; + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t io_size = abd_size; + uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t group = vdev_draid_offset_to_group(vd, io_offset); + uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); + + /* + * Limit the io_size to the space remaining in the group. A second + * row in the raidz_map_t is created for the remainder. + */ + if (io_offset + io_asize > start_offset) { + io_size = vdev_draid_asize_to_psize(vd, + start_offset - io_offset); + } + + /* + * At most a block may span the logical end of one group and the start + * of the next group. Therefore, at the end of a group the io_size must + * span the group width evenly and the remainder must be aligned to the + * start of the next group. + */ + IMPLY(abd_offset == 0 && io_size < zio->io_size, + (io_asize >> ashift) % vdc->vdc_groupwidth == 0); + IMPLY(abd_offset != 0, + vdev_draid_group_to_offset(vd, group) == io_offset); + + /* Lookup starting byte offset on each child vdev */ + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + io_offset, &perm, &groupstart); + + /* + * If there is less than groupwidth drives available after the group + * start, the group is going to wrap onto the next row. 'wrap' is the + * group disk number that starts on the next row. + */ + uint64_t ndisks = vdc->vdc_ndisks; + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t wrap = groupwidth; + + if (groupstart + groupwidth > ndisks) + wrap = ndisks - groupstart; + + /* The io size in units of the vdev's minimum sector size. */ + const uint64_t psize = io_size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + uint64_t q = psize / vdc->vdc_ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + uint64_t r = psize - q * vdc->vdc_ndata; + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity); + ASSERT3U(bc, <, groupwidth); + + /* The total number of data and parity sectors for this I/O. */ + uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1))); + + raidz_row_t *rr; + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); + rr->rr_cols = groupwidth; + rr->rr_scols = groupwidth; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = vdc->vdc_nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; +#ifdef ZFS_DEBUG + rr->rr_offset = io_offset; + rr->rr_size = io_size; +#endif + *rrp = rr; + + uint8_t *base; + uint64_t iter, asize = 0; + vdev_draid_get_perm(vdc, perm, &base, &iter); + for (uint64_t i = 0; i < groupwidth; i++) { + raidz_col_t *rc = &rr->rr_col[i]; + uint64_t c = (groupstart + i) % ndisks; + + /* increment the offset if we wrap to the next row */ + if (i == wrap) + physical_offset += VDEV_DRAID_ROWHEIGHT; + + rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); + rc->rc_offset = physical_offset; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; + + if (q == 0 && i >= bc) + rc->rc_size = 0; + else if (i < bc) + rc->rc_size = (q + 1) << ashift; + else + rc->rc_size = q << ashift; + + asize += rc->rc_size; + } + + ASSERT3U(asize, ==, tot << ashift); + rr->rr_nempty = roundup(tot, groupwidth) - tot; + IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc); + + /* Allocate buffers for the parity columns */ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + } + + /* + * Map buffers for data columns and allocate/map buffers for skip + * sectors. There are three distinct cases for dRAID which are + * required to support sequential rebuild. + */ + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_draid_map_alloc_write(zio, abd_offset, rr); + } else if ((rr->rr_nempty > 0) && + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + vdev_draid_map_alloc_scrub(zio, abd_offset, rr); + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + vdev_draid_map_alloc_read(zio, abd_offset, rr); + } + + return (io_size); +} + +/* + * Allocate the raidz mapping to be applied to the dRAID I/O. The parity + * calculations for dRAID are identical to raidz however there are a few + * differences in the layout. + * + * - dRAID always allocates a full stripe width. Any extra sectors due + * this padding are zero filled and written to disk. They will be read + * back during a scrub or repair operation since they are included in + * the parity calculation. This property enables sequential resilvering. + * + * - When the block at the logical offset spans redundancy groups then two + * rows are allocated in the raidz_map_t. One row resides at the end of + * the first group and the other at the start of the following group. 
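The q/r/bc bookkeeping above is easier to follow with numbers plugged in. The sketch below repeats the same arithmetic for a hypothetical draid2:8d layout with 4 KiB sectors (both the layout and the I/O sizes are invented for illustration): a 96 KiB write fills three full stripes with no remainder, while a 20 KiB write produces seven single-sector columns (two of them parity) plus three skip sectors to round out the group.

```
/* Illustrative only: dRAID row sizing for a hypothetical draid2:8d layout. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
	const uint64_t ashift = 12;			/* 4 KiB sectors */
	const uint64_t ndata = 8, nparity = 2;
	const uint64_t groupwidth = ndata + nparity;	/* 10 */
	const uint64_t io_size[2] = { 96 * 1024, 20 * 1024 };

	for (int i = 0; i < 2; i++) {
		uint64_t psize = io_size[i] >> ashift;	/* data sectors */
		uint64_t q = psize / ndata;		/* full stripes */
		uint64_t r = psize - q * ndata;		/* remainder sectors */
		uint64_t bc = (r == 0 ? 0 : r + nparity);
		uint64_t tot = psize + nparity * (q + (r == 0 ? 0 : 1));
		uint64_t nempty = ((tot + groupwidth - 1) / groupwidth) *
		    groupwidth - tot;			/* skip sectors */

		printf("%3" PRIu64 " KiB: q=%" PRIu64 " r=%" PRIu64
		    " bc=%" PRIu64 " tot=%" PRIu64 " nempty=%" PRIu64
		    " asize=%" PRIu64 " KiB\n", io_size[i] >> 10,
		    q, r, bc, tot, nempty, (tot + nempty) << (ashift - 10));
	}
	return (0);
}
```

The asize column is the rounded-up allocation dRAID makes for each request: whole rows of groupwidth sectors, which is why small blocks carry proportionally more padding than large ones.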
+ */ +static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio) +{ + raidz_row_t *rr[2]; + uint64_t abd_offset = 0; + uint64_t abd_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + uint64_t size; + int nrows = 1; + + size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset, + abd_offset, abd_size); + if (size < abd_size) { + vdev_t *vd = zio->io_vd; + + io_offset += vdev_draid_asize(vd, size); + abd_offset += size; + abd_size -= size; + nrows++; + + ASSERT3U(io_offset, ==, vdev_draid_group_to_offset( + vd, vdev_draid_offset_to_group(vd, io_offset))); + ASSERT3U(abd_offset, <, zio->io_size); + ASSERT3U(abd_size, !=, 0); + + size = vdev_draid_map_alloc_row(zio, &rr[1], + io_offset, abd_offset, abd_size); + VERIFY3U(size, ==, abd_size); + } + + raidz_map_t *rm; + rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP); + rm->rm_ops = vdev_raidz_math_get_ops(); + rm->rm_nrows = nrows; + rm->rm_row[0] = rr[0]; + if (nrows == 2) + rm->rm_row[1] = rr[1]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_draid_vsd_ops; + + return (rm); +} + +/* + * Given an offset into a dRAID, return the next group width aligned offset + * which can be used to start an allocation. + */ +static uint64_t +vdev_draid_get_astart(vdev_t *vd, const uint64_t start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift)); +} + +/* + * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child) + * rounded down to the last full slice. So each child must provide at least + * 1 / (children - nspares) of its asize. + */ +static uint64_t +vdev_draid_min_asize(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); +} + +/* + * When using dRAID the minimum allocation size is determined by the number + * of data disks in the redundancy group. Full stripes are always used. + */ +static uint64_t +vdev_draid_min_alloc(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (vdc->vdc_ndata << vd->vdev_ashift); +} + +/* + * Returns true if the txg range does not exist on any leaf vdev. + * + * A dRAID spare does not fit into the DTL model. While it has child vdevs + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Essentially we do a vdev_dtl_reassess() on the + * fly by replacing a dRAID spare with the child vdev under the offset. + * Note that it is a recursive process because the child vdev can be + * another dRAID spare and so on. + */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child + * contains the txg range then the data is not missing. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, physical_offset, + txg, size)) + return (B_FALSE); + } + + return (B_TRUE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. 
+ */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). + */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_missing(vd, physical_offset, + txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Returns true if the txg is only partially replicated on the leaf vdevs. + */ +static boolean_t +vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child is + * missing the txg range then it is partially replicated. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + } + + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. + */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). + */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_partial(vd, physical_offset, txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Determine if the vdev is readable at the given offset. + */ +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_readable(cvd, physical_offset)) + return (B_TRUE); + } + + return (B_FALSE); + } + + return (vdev_readable(vd)); +} + +/* + * Returns the first distributed spare found under the provided vdev tree. + */ +static vdev_t * +vdev_draid_find_spare(vdev_t *vd) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]); + if (svd != NULL) + return (svd); + } + + return (NULL); +} + +/* + * Returns B_TRUE if the passed in vdev is currently "faulted". + * Faulted, in this context, means that the vdev represents a + * replacing or sparing vdev tree. 
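The recursion described above — resolve a distributed spare to whichever leaf sits under the offset, and treat spare/replacing vdevs as missing only when every readable child is missing — can be reduced to a few lines over a toy tree. The sketch below uses plain structs and booleans instead of vdevs and DTLs, purely to show the shape of the walk; none of the names are real ZFS types.

```
/* Illustrative only: the shape of the recursive "is it missing?" walk. */
#include <stdio.h>
#include <stdbool.h>

typedef struct node {
	bool mirror_like;	/* spare/replacing: redundant children */
	bool leaf_missing;	/* leaves: does the DTL cover the range? */
	int nchildren;
	struct node *child[4];
} node_t;

static bool
range_missing(const node_t *n)
{
	if (n->nchildren == 0)
		return (n->leaf_missing);

	if (n->mirror_like) {
		/* The range survives if any child still has it. */
		for (int c = 0; c < n->nchildren; c++) {
			if (!range_missing(n->child[c]))
				return (false);
		}
		return (true);
	}

	/*
	 * Non-redundant interior node, e.g. a distributed spare resolved
	 * to the child that owns this offset: defer to that child.
	 */
	return (range_missing(n->child[0]));
}

int
main(void)
{
	node_t failed = { .nchildren = 0, .leaf_missing = true };
	node_t rebuilt = { .nchildren = 0, .leaf_missing = false };
	node_t replacing = { .mirror_like = true, .nchildren = 2,
	    .child = { &failed, &rebuilt } };

	printf("range missing: %s\n",
	    range_missing(&replacing) ? "yes" : "no");
	return (0);
}
```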
+ */ +static boolean_t +vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + + /* + * After resolving the distributed spare to a leaf vdev + * check the parent to determine if it's "faulted". + */ + vd = vd->vdev_parent; + } + + return (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Determine if the dRAID block at the logical offset is degraded. + * Used by sequential resilver. + */ +static boolean_t +vdev_draid_group_degraded(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Group contains a faulted vdev. */ + if (vdev_draid_faulted(cvd, physical_offset)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. + */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Determine if the txg is missing. Used by healing resilver. + */ +static boolean_t +vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, + uint64_t size) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Transaction group is known to be partially replicated. */ + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. + */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Find the smallest child asize and largest sector size to calculate the + * available capacity. Distributed spares are ignored since their capacity + * is also based on the minimum child size in the top-level dRAID. 
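One detail worth calling out in vdev_draid_calculate_asize() just below is the `MIN(asize - 1, cvd->vdev_asize - 1) + 1` form: because the running value starts at zero and the arithmetic is unsigned 64-bit, `0 - 1` wraps to UINT64_MAX, so the first child always becomes the initial minimum without a special case. A small stand-alone demonstration with made-up child sizes:

```
/* Illustrative only: seeding a running minimum with 0 via unsigned wrap. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t child_asize[3] = { 4000, 3800, 4200 };	/* hypothetical */
	uint64_t asize = 0;

	for (int c = 0; c < 3; c++) {
		/*
		 * On the first pass asize - 1 wraps to UINT64_MAX, so the
		 * child always wins; afterwards this is a plain minimum.
		 */
		asize = MIN(asize - 1, child_asize[c] - 1) + 1;
	}

	printf("smallest child asize: %" PRIu64 "\n", asize);	/* 3800 */
	return (0);
}
```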
+ */ +static void +vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, + uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) +{ + uint64_t logical_ashift = 0, physical_ashift = 0; + uint64_t asize = 0, max_asize = 0; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + + asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; + max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; + logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); + physical_ashift = MAX(physical_ashift, + cvd->vdev_physical_ashift); + } + + *asizep = asize; + *max_asizep = max_asize; + *logical_ashiftp = logical_ashift; + *physical_ashiftp = physical_ashift; +} + +/* + * Open spare vdevs. + */ +static boolean_t +vdev_draid_open_spares(vdev_t *vd) +{ + return (vd->vdev_ops == &vdev_draid_spare_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Open all children, excluding spares. + */ +static boolean_t +vdev_draid_open_children(vdev_t *vd) +{ + return (!vdev_draid_open_spares(vd)); +} + +/* + * Open a top-level dRAID vdev. + */ +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t nparity = vdc->vdc_nparity; + int open_errors = 0; + + if (nparity > VDEV_DRAID_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * First open the normal children then the distributed spares. This + * ordering is important to ensure the distributed spares calculate + * the correct psize in the event that the dRAID vdevs were expanded. + */ + vdev_open_children_subset(vd, vdev_draid_open_children); + vdev_open_children_subset(vd, vdev_draid_open_spares); + + /* Verify enough of the children are available to continue. */ + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c]->vdev_open_error != 0) { + if ((++open_errors) > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (SET_ERROR(ENXIO)); + } + } + } + + /* + * Allocatable capacity is the sum of the space on all children less + * the number of distributed spares rounded down to last full row + * and then to the last full group. An additional 32MB of scratch + * space is reserved at the end of each child for use by the dRAID + * expansion feature. + */ + uint64_t child_asize, child_max_asize; + vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize, + logical_ashift, physical_ashift); + + /* + * Should be unreachable since the minimum child size is 64MB, but + * we want to make sure an underflow absolutely cannot occur here. + */ + if (child_asize < VDEV_DRAID_REFLOW_RESERVE || + child_max_asize < VDEV_DRAID_REFLOW_RESERVE) { + return (SET_ERROR(ENXIO)); + } + + child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + + *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + + return (0); +} + +/* + * Close a top-level dRAID vdev. 
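To make the capacity math in vdev_draid_open() concrete, the sketch below pushes one invented configuration through the same steps: subtract the reflow reserve, round the child size down to the row height, scale by the usable disk count, and round down to the group size. The 16 MiB row height matches the figure used in the examples earlier in this file and the 32 MiB reserve matches the comment above; the draid2:8d:12c:1s layout and the 4000 MiB child size are assumptions made only for this example.

```
/* Illustrative only: dRAID capacity rounding for an invented config. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
	const uint64_t MiB = 1024ULL * 1024;
	const uint64_t rowheight = 16 * MiB;	/* assumed row height */
	const uint64_t reserve = 32 * MiB;	/* expansion scratch space */

	/* Hypothetical draid2:8d:12c:1s layout. */
	const uint64_t children = 12, nspares = 1;
	const uint64_t ndata = 8, nparity = 2;
	const uint64_t ndisks = children - nspares;		/* 11 */
	const uint64_t groupsz = (ndata + nparity) * rowheight;	/* 160 MiB */

	uint64_t child = 4000 * MiB;		/* smallest child (invented) */

	child = ((child - reserve) / rowheight) * rowheight;
	uint64_t asize = ((child * ndisks) / groupsz) * groupsz;

	printf("usable per child: %" PRIu64 " MiB, dRAID asize: %" PRIu64
	    " MiB\n", child / MiB, asize / MiB);
	return (0);
}
```

With these made-up numbers the usable space per child stays at 3968 MiB and the top-level asize rounds down to 43520 MiB, showing how both the row-height and group-size roundings shave a little capacity off the raw total.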
+ */ +static void +vdev_draid_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } +} + +/* + * Return the maximum asize for a rebuild zio in the provided range + * given the following constraints. A dRAID chunk may not: + * + * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or + * - Span dRAID redundancy groups. + */ +static uint64_t +vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t ashift = vd->vdev_ashift; + uint64_t ndata = vdc->vdc_ndata; + uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift), + SPA_MAXBLOCKSIZE); + + ASSERT3U(vdev_draid_get_astart(vd, start), ==, start); + ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0); + + /* Chunks must evenly span all data columns in the group. */ + psize = (((psize >> ashift) / ndata) * ndata) << ashift; + uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize)); + + /* Reduce the chunk size to the group space remaining. */ + uint64_t group = vdev_draid_offset_to_group(vd, start); + uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start; + chunk_size = MIN(chunk_size, left); + + ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0); + ASSERT3U(vdev_draid_offset_to_group(vd, start), ==, + vdev_draid_offset_to_group(vd, start + chunk_size - 1)); + + return (chunk_size); +} + +/* + * Align the start of the metaslab to the group width and slightly reduce + * its size to a multiple of the group width. Since full stripe writes are + * required by dRAID this space is unallocable. Furthermore, aligning the + * metaslab start is important for vdev initialize and TRIM which both operate + * on metaslab boundaries which vdev_xlate() expects to be aligned. + */ +static void +vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift; + uint64_t astart = vdev_draid_get_astart(vd, *ms_start); + uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz; + + *ms_start = astart; + *ms_size = asize; + + ASSERT0(*ms_start % sz); + ASSERT0(*ms_size % sz); +} + +/* + * Add virtual dRAID spares to the list of valid spares. In order to accomplish + * this, the existing array must be freed and reallocated with the additional + * entries. + */ +int +vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, + uint64_t next_vdev_id) +{ + uint64_t draid_nspares = 0; + uint64_t ndraid = 0; + int error; + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + + if (cvd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = cvd->vdev_tsd; + draid_nspares += vdc->vdc_nspares; + ndraid++; + } + } + + if (draid_nspares == 0) { + *ndraidp = ndraid; + return (0); + } + + nvlist_t **old_spares, **new_spares; + uint_t old_nspares; + error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &old_spares, &old_nspares); + if (error) + old_nspares = 0; + + /* Allocate memory and copy the existing spares. */ + new_spares = kmem_alloc(sizeof (nvlist_t *) * + (draid_nspares + old_nspares), KM_SLEEP); + for (uint_t i = 0; i < old_nspares; i++) + new_spares[i] = fnvlist_dup(old_spares[i]); + + /* Add new distributed spares to ZPOOL_CONFIG_SPARES. 
*/ + uint64_t n = old_nspares; + for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) { + vdev_t *cvd = vd->vdev_child[vdev_id]; + char path[64]; + + if (cvd->vdev_ops != &vdev_draid_ops) + continue; + + vdev_draid_config_t *vdc = cvd->vdev_tsd; + uint64_t nspares = vdc->vdc_nspares; + uint64_t nparity = vdc->vdc_nparity; + + for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { + bzero(path, sizeof (path)); + (void) snprintf(path, sizeof (path) - 1, + "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, + (u_longlong_t)nparity, + (u_longlong_t)next_vdev_id + vdev_id, + (u_longlong_t)spare_id); + + nvlist_t *spare = fnvlist_alloc(); + fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, + cvd->vdev_guid); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, + spare_id); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, + cvd->vdev_ashift); + + new_spares[n] = spare; + n++; + } + } + + if (n > 0) { + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + new_spares, n); + } + + for (int i = 0; i < n; i++) + nvlist_free(new_spares[i]); + + kmem_free(new_spares, sizeof (*new_spares) * n); + *ndraidp = ndraid; + + return (0); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. + */ +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = vdev_draid_asize(vd, psize); + + if (phys_birth == TXG_UNKNOWN) { + /* + * Sequential resilver. There is no meaningful phys_birth + * for this block, we can only determine if block resides + * in a degraded group in which case it must be resilvered. + */ + ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==, + vdev_draid_offset_to_group(vd, offset + asize - 1)); + + return (vdev_draid_group_degraded(vd, offset)); + } else { + /* + * Healing resilver. TXGs not in DTL_PARTIAL are intact, + * as are blocks in non-degraded groups. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (vdev_draid_group_missing(vd, offset, phys_birth, 1)) + return (B_TRUE); + + /* The block may span groups in which case check both. 
*/ + if (vdev_draid_offset_to_group(vd, offset) != + vdev_draid_offset_to_group(vd, offset + asize - 1)) { + if (vdev_draid_group_missing(vd, + offset + asize, phys_birth, 1)) + return (B_TRUE); + } + + return (B_FALSE); + } +} + +static boolean_t +vdev_draid_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_draid_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static void +vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +{ +#ifdef ZFS_DEBUG + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_draid_asize(vd, rr->rr_size); + + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end); +#endif +} + +/* + * For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. A gang ABD is allocated by vdev_draid_map_alloc() + * if a skip sector needs to be added to a column. + */ +static void +vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * Empty columns are zero filled and included in the parity + * calculation and therefore must be written. + */ + ASSERT3U(rc->rc_size, !=, 0); + + /* Verify physical to logical translation */ + vdev_draid_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } +} + +/* + * For read operations: + * 1. The vdev_draid_map_alloc() function will create a minimal raidz + * mapping for the read based on the zio->io_flags. There are two + * possible mappings either 1) a normal read, or 2) a scrub/resilver. + * 2. Create the zio read operations. This will include all parity + * columns and skip sectors for a scrub/resilver. + */ +static void +vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* Sequential rebuild must do IO at redundancy group boundary. */ + IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last. Any errors along the way will force us to read the parity. + * For scrub/resilver IOs which verify skip sectors, a gang ABD will + * have been allocated to store them and rc->rc_size is increased. 
+ */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; + rc->rc_skipped = 1; + continue; + } + + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + + /* + * Empty columns may be read during vdev_draid_io_done(). + * Only skip them after the readable and missing checks + * verify they are available. + */ + if (rc->rc_size == 0) { + rc->rc_skipped = 1; + continue; + } + + if (zio->io_flags & ZIO_FLAG_RESILVER) { + vdev_t *svd; + + /* + * If this child is a distributed spare then the + * offset might reside on the vdev being replaced. + * In which case this data must be written to the + * new device. Failure to do so would result in + * checksum errors when the old device is detached + * and the pool is scrubbed. + */ + if ((svd = vdev_draid_find_spare(cvd)) != NULL) { + svd = vdev_draid_spare_get_child(svd, + rc->rc_offset); + if (svd && (svd->vdev_ops == &vdev_spare_ops || + svd->vdev_ops == &vdev_replacing_ops)) { + rc->rc_repair = 1; + } + } + + /* + * Always issue a repair IO to this child when its + * a spare or replacing vdev with an active rebuild. + */ + if ((cvd->vdev_ops == &vdev_spare_ops || + cvd->vdev_ops == &vdev_replacing_ops) && + vdev_draid_rebuilding(cvd)) { + rc->rc_repair = 1; + } + } + } + + /* + * Either a parity or data column is missing this means a repair + * may be attempted by vdev_draid_io_done(). Expand the raid map + * to read in empty columns which are needed along with the parity + * during reconstruction. + */ + if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) && + rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) { + vdev_draid_map_alloc_empty(zio, rr); + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + +/* + * Start an IO operation to a dRAID vdev. + */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd __maybe_unused = zio->io_vd; + raidz_map_t *rm; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset)); + + rm = vdev_draid_map_alloc(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_write(zio, rm->rm_row[i]); + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_read(zio, rm->rm_row[i]); + } + } + + zio_execute(zio); +} + +/* + * Complete an IO operation on a dRAID vdev. The raidz logic can be applied + * to dRAID since the layout is fully described by the raidz_map_t. 
+ */ +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_raidz_io_done(zio); +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + ASSERT(vd->vdev_ops == &vdev_draid_ops); + + if (faulted > vdc->vdc_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_draid_ops); + + vdev_draid_config_t *vdc = raidvd->vdev_tsd; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* Make sure the offsets are block-aligned */ + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + + uint64_t logical_start = logical_rs->rs_start; + uint64_t logical_end = logical_rs->rs_end; + + /* + * Unaligned ranges must be skipped. All metaslabs are correctly + * aligned so this should not happen, but this case is handled in + * case it's needed by future callers. + */ + uint64_t astart = vdev_draid_get_astart(raidvd, logical_start); + if (astart != logical_start) { + physical_rs->rs_start = logical_start; + physical_rs->rs_end = logical_start; + remain_rs->rs_start = MIN(astart, logical_end); + remain_rs->rs_end = logical_end; + return; + } + + /* + * Unlike with mirrors and raidz a dRAID logical range can map + * to multiple non-contiguous physical ranges. This is handled by + * limiting the size of the logical range to a single group and + * setting the remain argument such that it describes the remaining + * unmapped logical range. This is stricter than absolutely + * necessary but helps simplify the logic below. + */ + uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start); + uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1); + if (logical_end > nextstart) + logical_end = nextstart; + + /* Find the starting offset for each vdev in the group */ + uint64_t perm, groupstart; + uint64_t start = vdev_draid_logical_to_physical(raidvd, + logical_start, &perm, &groupstart); + uint64_t end = start; + + uint8_t *base; + uint64_t iter, id; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + /* + * Check if the passed child falls within the group. If it does + * update the start and end to reflect the physical range. + * Otherwise, leave them unmodified which will result in an empty + * (zero-length) physical range being returned. + */ + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + + if (c == 0 && i != 0) { + /* the group wrapped, increment the start */ + start += VDEV_DRAID_ROWHEIGHT; + end = start; + } + + id = vdev_draid_permute_id(vdc, base, iter, c); + if (id == cvd->vdev_id) { + uint64_t b_size = (logical_end >> ashift) - + (logical_start >> ashift); + ASSERT3U(b_size, >, 0); + end = start + ((((b_size - 1) / + vdc->vdc_groupwidth) + 1) << ashift); + break; + } + } + physical_rs->rs_start = start; + physical_rs->rs_end = end; + + /* + * Only top-level vdevs are allowed to set remain_rs because + * when .vdev_op_xlate() is called for their children the full + * logical range is not provided by vdev_xlate(). 
+ */ + remain_rs->rs_start = logical_end; + remain_rs->rs_end = logical_rs->rs_end; + + ASSERT3U(physical_rs->rs_start, <=, logical_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_end - logical_start); +} + +/* + * Add dRAID specific fields to the config nvlist. + */ +static void +vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + vdev_draid_config_t *vdc = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); +} + +/* + * Initialize private dRAID specific fields from the nvlist. + */ +static int +vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + uint64_t ndata, nparity, nspares, ngroups; + int error; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || + nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + return (SET_ERROR(EINVAL)); + } + + uint_t children; + nvlist_t **child; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children == 0 || + children > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100 || nspares > (children - (ndata + nparity))) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + /* + * Validate the minimum number of children exist per group for the + * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). + */ + if (children < (ndata + nparity + nspares)) + return (SET_ERROR(EINVAL)); + + /* + * Create the dRAID configuration using the pool nvlist configuration + * and the fixed mapping for the correct number of children. + */ + vdev_draid_config_t *vdc; + const draid_map_t *map; + + error = vdev_draid_lookup_map(children, &map); + if (error) + return (SET_ERROR(EINVAL)); + + vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); + vdc->vdc_ndata = ndata; + vdc->vdc_nparity = nparity; + vdc->vdc_nspares = nspares; + vdc->vdc_children = children; + vdc->vdc_ngroups = ngroups; + vdc->vdc_nperms = map->dm_nperms; + + error = vdev_draid_generate_perms(map, &vdc->vdc_perms); + if (error) { + kmem_free(vdc, sizeof (*vdc)); + return (SET_ERROR(EINVAL)); + } + + /* + * Derived constants. 
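The assignments that follow compute the dRAID geometry from the validated nvlist values. As an illustration only (not part of the patch), the sketch below works through the same arithmetic for a hypothetical draid2:8d:23c:1s layout with 11 groups; ROWHEIGHT stands in for VDEV_DRAID_ROWHEIGHT, whose actual value is defined in the dRAID headers and not shown in this hunk.

```
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	ROWHEIGHT	1ULL	/* symbolic unit; substitute the real value */

int
main(void)
{
	uint64_t ndata = 8, nparity = 2, nspares = 1, children = 23;
	uint64_t ngroups = 11;	/* hypothetical; must satisfy the check below */

	uint64_t groupwidth = ndata + nparity;			/* 10 */
	uint64_t ndisks = children - nspares;			/* 22 */
	uint64_t groupsz = groupwidth * ROWHEIGHT;		/* 10 rows */
	uint64_t devslicesz = (groupsz * ngroups) / ndisks;	/* 5 rows */

	/* Mirrors the ASSERT3U checks in the code below */
	assert(groupwidth >= 2 && groupwidth <= ndisks);
	assert((groupwidth * ngroups) % ndisks == 0);
	assert(devslicesz % ROWHEIGHT == 0);

	printf("groupwidth=%llu ndisks=%llu groupsz=%llu devslicesz=%llu\n",
	    (unsigned long long)groupwidth, (unsigned long long)ndisks,
	    (unsigned long long)groupsz, (unsigned long long)devslicesz);
	return (0);
}
```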
 + */ + vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity; + vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares; + vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT; + vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) / + vdc->vdc_ndisks; + + ASSERT3U(vdc->vdc_groupwidth, >=, 2); + ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); + ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); + ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % + vdc->vdc_ndisks, ==, 0); + + *tsd = vdc; + + return (0); +} + +static void +vdev_draid_fini(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + vmem_free(vdc->vdc_perms, sizeof (uint8_t) * + vdc->vdc_children * vdc->vdc_nperms); + kmem_free(vdc, sizeof (*vdc)); +} + +static uint64_t +vdev_draid_nparity(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_nparity); +} + +static uint64_t +vdev_draid_ndisks(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_ndisks); +} + +vdev_ops_t vdev_draid_ops = { + .vdev_op_init = vdev_draid_init, + .vdev_op_fini = vdev_draid_fini, + .vdev_op_open = vdev_draid_open, + .vdev_op_close = vdev_draid_close, + .vdev_op_asize = vdev_draid_asize, + .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_alloc = vdev_draid_min_alloc, + .vdev_op_io_start = vdev_draid_io_start, + .vdev_op_io_done = vdev_draid_io_done, + .vdev_op_state_change = vdev_draid_state_change, + .vdev_op_need_resilver = vdev_draid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_draid_xlate, + .vdev_op_rebuild_asize = vdev_draid_rebuild_asize, + .vdev_op_metaslab_init = vdev_draid_metaslab_init, + .vdev_op_config_generate = vdev_draid_config_generate, + .vdev_op_nparity = vdev_draid_nparity, + .vdev_op_ndisks = vdev_draid_ndisks, + .vdev_op_type = VDEV_TYPE_DRAID, + .vdev_op_leaf = B_FALSE, +}; + + +/* + * A dRAID distributed spare is a virtual leaf vdev which is included in the + * parent dRAID configuration. The last N columns of the dRAID permutation + * table are used to determine on which dRAID children a specific offset + * should be written. These spare leaf vdevs can only be used to replace + * faulted children in the same dRAID configuration. + */ + +/* + * Distributed spare state. All fields are set when the distributed spare is + * first opened and are immutable. + */ +typedef struct { + vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */ + uint64_t vds_top_guid; /* top-level parent dRAID guid */ + uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */ +} vdev_draid_spare_t; + +/* + * Returns the parent dRAID vdev to which the distributed spare belongs. + * This may be safely called even when the vdev is not open. + */ +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + if (vds->vds_draid_vdev != NULL) + return (vds->vds_draid_vdev); + + return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev, + vds->vds_top_guid)); +} + +/* + * A dRAID spare is active when it's the child of a vdev using the + * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops. 
 + */ +static boolean_t +vdev_draid_spare_is_active(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_draid_ops)) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* + * Given a dRAID distributed spare vdev, returns the physical child vdev + * on which the provided offset resides. This may involve recursing through + * multiple layers of distributed spares. Note that offset is relative to + * this vdev. + */ +vdev_t * +vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + /* The vdev is closed */ + if (vds->vds_draid_vdev == NULL) + return (NULL); + + vdev_t *tvd = vds->vds_draid_vdev; + vdev_draid_config_t *vdc = tvd->vdev_tsd; + + ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); + + uint8_t *base; + uint64_t iter; + uint64_t perm = physical_offset / vdc->vdc_devslicesz; + + vdev_draid_get_perm(vdc, perm, &base, &iter); + + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, + (tvd->vdev_children - 1) - vds->vds_spare_id); + vdev_t *cvd = tvd->vdev_child[cid]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_get_child(cvd, physical_offset)); + + return (cvd); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_close(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + vds->vds_draid_vdev = NULL; +} + +/* + * Opening a dRAID spare device is done by looking up the associated dRAID + * top-level vdev guid from the spare configuration. + */ +static int +vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + uint64_t asize, max_asize; + + vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid); + if (tvd == NULL) { + /* + * When spa_vdev_add() is labeling new spares, the + * associated dRAID is not attached to the root vdev + * nor does this spare have a parent. Simulate a valid + * device in order to allow the label to be initialized + * and the distributed spare added to the configuration. + */ + if (vd->vdev_parent == NULL) { + *psize = *max_psize = SPA_MINDEVSIZE; + *logical_ashift = *physical_ashift = ASHIFT_MIN; + return (0); + } + + return (SET_ERROR(EINVAL)); + } + + vdev_draid_config_t *vdc = tvd->vdev_tsd; + if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL) + return (SET_ERROR(EINVAL)); + + if (vds->vds_spare_id >= vdc->vdc_nspares) + return (SET_ERROR(EINVAL)); + + /* + * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here + * because the caller may be vdev_draid_open() in which case the + * values are stale as they haven't yet been updated by vdev_open(). + * To avoid this, always recalculate the dRAID asize and max_asize. + */ + vdev_draid_calculate_asize(tvd, &asize, &max_asize, + logical_ashift, physical_ashift); + + *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + + vds->vds_draid_vdev = tvd; + + return (0); +} + +/* + * Completed distributed spare IO. Store the result in the parent zio + * as if it had performed the operation itself. Only the first error is + * preserved if there are multiple errors. 
+ */ +static void +vdev_draid_spare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + /* + * IOs are issued to non-writable vdevs in order to keep their + * DTLs accurate. However, we don't want to propagate the + * error in to the distributed spare's DTL. When resilvering + * vdev_draid_need_resilver() will consult the relevant DTL + * to determine if the data is missing and must be repaired. + */ + if (!vdev_writeable(zio->io_vd)) + return; + + if (pio->io_error == 0) + pio->io_error = zio->io_error; +} + +/* + * Returns a valid label nvlist for the distributed spare vdev. This is + * used to bypass the IO pipeline to avoid the complexity of constructing + * a complete label with valid checksum to return when read. + */ +nvlist_t * +vdev_draid_read_config_spare(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + uint64_t guid = vd->vdev_guid; + + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, + vdev_draid_spare_is_active(vd) ? + POOL_STATE_ACTIVE : POOL_STATE_SPARE); + + /* Set the vdev guid based on the vdev list in sav_count. */ + for (int i = 0; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + return (nv); +} + +/* + * Handle any ioctl requested of the distributed spare. Only flushes + * are supported in which case all children must be flushed. + */ +static int +vdev_draid_spare_ioctl(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + int error = 0; + + if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } else { + error = SET_ERROR(ENOTSUP); + } + + return (error); +} + +/* + * Initiate an IO to the distributed spare. For normal IOs this entails using + * the zio->io_offset and permutation table to calculate which child dRAID vdev + * is responsible for the data. Then passing along the zio to that child to + * perform the actual IO. The label ranges are not stored on disk and require + * some special handling which is described below. + */ +static void +vdev_draid_spare_io_start(zio_t *zio) +{ + vdev_t *cvd = NULL, *vd = zio->io_vd; + vdev_draid_spare_t *vds = vd->vdev_tsd; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vds == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + zio->io_error = vdev_draid_spare_ioctl(zio); + break; + + case ZIO_TYPE_WRITE: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs and config writers to simulate the + * existence of an on disk label. 
vdev_label_sync(), + * vdev_uberblock_sync() and vdev_copy_uberblocks() + * skip the distributed spares. This only leaves + * vdev_label_init(), which is allowed to succeed to + * avoid adding special cases to the function. + */ + if (zio->io_flags & ZIO_FLAG_PROBE || + zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_READ: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs to simulate the existence of a + * label. vdev_label_read_config() bypasses the + * pipeline to read the label configuration and + * vdev_uberblock_load() skips distributed spares + * when attempting to locate the best uberblock. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_TRIM: + /* The vdev label ranges are never trimmed */ + ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); + + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !cvd->vdev_has_trim) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_io_done(zio_t *zio) +{ +} + +/* + * Look up the full spare config in spa->spa_spares.sav_config and + * return the top_guid and spare_id for the named spare. 
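The lookup routine that follows scans the ZPOOL_CONFIG_SPARES array for an entry whose type and path match. As an illustration only (not part of the patch), the userspace sketch below builds one spare entry with the same nvlist keys used earlier in this change and reads back the fields the lookup searches for; it assumes the OpenZFS development headers and libnvpair are available.

```
#include <sys/types.h>
#include <sys/fs/zfs.h>
#include <libnvpair.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	nvlist_t *spare = fnvlist_alloc();

	/* Same keys the dRAID spare config is built with */
	fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, "draid2-0-0");
	fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE);
	fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, 0x1234ULL);
	fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, 0);

	/* Match on type and path, as the lookup routine does */
	char *type = fnvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE);
	char *path = fnvlist_lookup_string(spare, ZPOOL_CONFIG_PATH);

	if (strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0 &&
	    strcmp(path, "draid2-0-0") == 0) {
		printf("top_guid=%llx spare_id=%llu\n",
		    (unsigned long long)fnvlist_lookup_uint64(spare,
		    ZPOOL_CONFIG_TOP_GUID),
		    (unsigned long long)fnvlist_lookup_uint64(spare,
		    ZPOOL_CONFIG_SPARE_ID));
	}

	fnvlist_free(spare);
	return (0);
}
```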
+ */ +static int +vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, + uint64_t *spare_idp) +{ + nvlist_t **spares; + uint_t nspares; + int error; + + if ((spa->spa_spares.sav_config == NULL) || + (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) { + return (SET_ERROR(ENOENT)); + } + + char *spare_name; + error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); + if (error != 0) + return (SET_ERROR(EINVAL)); + + for (int i = 0; i < nspares; i++) { + nvlist_t *spare = spares[i]; + uint64_t top_guid, spare_id; + char *type, *path; + + /* Skip non-distributed spares */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); + if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) + continue; + + /* Skip spares with the wrong name */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path); + if (error != 0 || strcmp(path, spare_name) != 0) + continue; + + /* Found the matching spare */ + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_TOP_GUID, &top_guid); + if (error == 0) { + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_SPARE_ID, &spare_id); + } + + if (error != 0) { + return (SET_ERROR(EINVAL)); + } else { + *top_guidp = top_guid; + *spare_idp = spare_id; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* + * Initialize private dRAID spare specific fields from the nvlist. + */ +static int +vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_draid_spare_t *vds; + uint64_t top_guid = 0; + uint64_t spare_id; + + /* + * In the normal case check the list of spares stored in the spa + * to lookup the top_guid and spare_id for provided spare config. + * When creating a new pool or adding vdevs the spare list is not + * yet populated and the values are provided in the passed config. 
+ */ + if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID, + &spare_id) != 0) + return (SET_ERROR(EINVAL)); + } + + vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP); + vds->vds_draid_vdev = NULL; + vds->vds_top_guid = top_guid; + vds->vds_spare_id = spare_id; + + *tsd = vds; + + return (0); +} + +static void +vdev_draid_spare_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t)); +} + +static void +vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id); +} + +vdev_ops_t vdev_draid_spare_ops = { + .vdev_op_init = vdev_draid_spare_init, + .vdev_op_fini = vdev_draid_spare_fini, + .vdev_op_open = vdev_draid_spare_open, + .vdev_op_close = vdev_draid_spare_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_draid_spare_io_start, + .vdev_op_io_done = vdev_draid_spare_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_draid_spare_config_generate, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DRAID_SPARE, + .vdev_op_leaf = B_TRUE, +}; diff --git a/module/zfs/vdev_draid_rand.c b/module/zfs/vdev_draid_rand.c new file mode 100644 index 000000000..fe1a75c11 --- /dev/null +++ b/module/zfs/vdev_draid_rand.c @@ -0,0 +1,40 @@ +/* + * Xorshift Pseudo Random Number Generator based on work by David Blackman + * and Sebastiano Vigna ([email protected]). + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * http://prng.di.unimi.it/xoroshiro128plusplus.c + * + * To the extent possible under law, the author has dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * See <http://creativecommons.org/publicdomain/zero/1.0/>. + * + * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, + * small-state generators. It is extremely (sub-ns) fast and it passes all + * tests we are aware of, but its state space is large enough only for + * mild parallelism. 
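The kernel implementation of this generator follows immediately below. For context only (not part of the patch), the sketch is a userspace copy of the same xoroshiro128++ step driving a deterministic Fisher-Yates shuffle of disk indices; the actual dRAID permutation generator is not shown in this hunk, so this merely illustrates how a seeded generator of this kind yields reproducible device orderings.

```
#include <stdint.h>
#include <stdio.h>

static inline uint64_t
rotl(const uint64_t x, int k)
{
	return ((x << k) | (x >> (64 - k)));
}

/* Same xoroshiro128++ step as the kernel code below */
static uint64_t
prng_next(uint64_t *s)
{
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 + s1, 17) + s0;

	s1 ^= s0;
	s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21);
	s[1] = rotl(s1, 28);

	return (result);
}

int
main(void)
{
	/* Arbitrary nonzero 128-bit seed; same seed, same ordering */
	uint64_t state[2] = { 0xd2ae9dd4f4b2bd7bULL, 0x1234567890abcdefULL };
	uint8_t disks[8];

	for (int i = 0; i < 8; i++)
		disks[i] = i;

	/* Fisher-Yates shuffle driven by the generator */
	for (int i = 7; i > 0; i--) {
		int j = prng_next(state) % (i + 1);
		uint8_t tmp = disks[i];
		disks[i] = disks[j];
		disks[j] = tmp;
	}

	for (int i = 0; i < 8; i++)
		printf("%u ", disks[i]);
	printf("\n");
	return (0);
}
```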
+ */ + +#include <sys/vdev_draid.h> + +static inline uint64_t rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +uint64_t +vdev_draid_rand(uint64_t *s) +{ + const uint64_t s0 = s[0]; + uint64_t s1 = s[1]; + const uint64_t result = rotl(s0 + s1, 17) + s0; + + s1 ^= s0; + s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b + s[1] = rotl(s1, 28); // c + + return (result); +} diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 12ee393bd..009394bfe 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, @@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 7ff7fffcc..083ad2861 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } + + vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_INITIALIZE_ACTIVE || + old_state == VDEV_INITIALIZE_SUSPENDED) + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, @@ -318,6 +322,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) } static void +vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_initialize_bytes_est += size; + + if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && + vd->vdev_initialize_last_offset < physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - physical_rs->rs_start; + } +} + +static void vdev_initialize_calculate_progress(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || @@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd) metaslab_t 
*msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been initialized */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been initialized */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_initialize_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_initialize_last_offset > last_rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd) &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } + + vdev_xlate_walk(vd, &logical_rs, + vdev_initialize_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -419,55 +443,48 @@ vdev_initialize_load(vdev_t *vd) return (err); } -/* - * Convert the logical range into a physical range and add it to our - * avl tree. - */ static void -vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) { vdev_t *vd = arg; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = start; - logical_rs.rs_end = start + size; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + if (physical_rs->rs_end <= vd->vdev_initialize_last_offset) return; /* Pick up where we left off mid-range. 
*/ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)physical_rs->rs_start, + (u_longlong_t)physical_rs->rs_end, (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, + (u_longlong_t)physical_rs->rs_end); + ASSERT3U(physical_rs->rs_end, >, vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; + physical_rs->rs_start = vd->vdev_initialize_last_offset; } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + +/* + * Convert the logical range into a physical range and add it to our + * avl tree. + */ +static void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg64_t logical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } static void diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d063b77ea..fbd117d2d 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops->vdev_op_config_generate != NULL) + vd->vdev_ops->vdev_op_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + /* + * The label for a dRAID distributed spare is not stored on disk. 
+ * Instead it is generated when needed which allows us to bypass + * the pipeline when reading the config from the label. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_read_config_spare(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd) SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); + /* + * No uberblocks are stored on distributed spares, they may be + * safely skipped when expanding a leaf vdev. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * There's no need to write uberblocks to a distributed spare, they + * are already stored on all the leaves of the parent dRAID. For + * this same reason vdev_uberblock_load_impl() skips distributed + * spares when reading uberblocks. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { @@ -1764,6 +1772,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, return; /* + * The top-level config never needs to be written to a distributed + * spare. When read vdev_dspare_label_read_config() will generate + * the config for the vdev_label_read_config(). + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + + /* * Generate a label describing the top-level config to which we belong. */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 71b5adbbd..71ca43cae 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -33,6 +33,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/zio.h> #include <sys/abd.h> #include <sys/fs/zfs.h> @@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void) /* * Virtual device vector for mirroring. 
*/ - typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; @@ -108,6 +108,7 @@ typedef struct mirror_child { uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; + uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { @@ -115,6 +116,7 @@ typedef struct mirror_map { int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; + boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; @@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) return (load + zfs_vdev_mirror_rotating_seek_inc); } +static boolean_t +vdev_mirror_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_mirror_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. @@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio) mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (vdev_mirror_rebuilding(mc->mc_vd)) + mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } @@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t +vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read - * preferring vdevs based on determined load. + * preferring vdevs based on determined load. If we can't, try the read on + * any vdev we haven't already tried. * - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. + * Distributed spares are an exception to the above load rule. They are + * always preferred in order to detect gaps in the distributed spare which + * are created when another disk in the dRAID fails. In order to restore + * redundancy those gaps must be read to trigger the required repair IO. 
*/ static int vdev_mirror_child_select(zio_t *zio) @@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } + if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { + mm->mm_preferred[0] = c; + mm->mm_preferred_cnt = 1; + break; + } + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; @@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; + c++; + + /* + * When sequentially resilvering only issue write repair + * IOs to the vdev which is being rebuilt since performance + * is limited by the slowest child. This is an issue for + * faster replacement devices such as distributed spares. + */ + if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && + (zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SCRUB) && + mm->mm_rebuilding && !mc->mc_rebuilding) { + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); - c++; } zio_execute(zio); @@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_error == 0) { + vdev_ops_t *ops = mc->mc_vd->vdev_ops; + if (mc->mc_tried) continue; /* @@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio) * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - - * 2. it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) + * 2. it's an indirect or distributed spare + * vdev (in which case it could point to any + * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && + ops != &vdev_indirect_ops && + ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } } +/* + * Return the maximum asize for a rebuild zio in the provided range. 
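The function that follows caps the size of a single sequential-rebuild I/O. The sketch below (not part of the patch) shows the same rounding and clamping arithmetic with hypothetical numbers; it assumes psize and asize are equal, which holds for a plain leaf but not for every vdev type, and that SPA_MAXBLOCKSIZE is 16 MiB.

```
#include <stdint.h>
#include <stdio.h>

#define	P2ROUNDUP(x, align)	(-(-(uint64_t)(x) & -(uint64_t)(align)))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))
#define	SPA_MAXBLOCKSIZE	(1ULL << 24)	/* assumed 16 MiB */

static uint64_t
rebuild_asize(uint64_t asize, uint64_t max_segment, uint64_t ashift)
{
	/* Round the segment up to the sector size and cap it */
	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1ULL << ashift),
	    SPA_MAXBLOCKSIZE);

	/* Never return more than the range left to rebuild */
	return (MIN(asize, psize));
}

int
main(void)
{
	/* 1 MiB segments on a 4K-sector mirror with 3 MiB remaining */
	printf("%llu\n", (unsigned long long)
	    rebuild_asize(3ULL << 20, 1ULL << 20, 12));	/* 1048576 */

	/* Tiny trailing range: clamped to the remaining 8K */
	printf("%llu\n", (unsigned long long)
	    rebuild_asize(8192, 1ULL << 20, 12));	/* 8192 */
	return (0);
}
```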
+ */ +static uint64_t +vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + vdev_ops_t vdev_mirror_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index ce90df6e8..e9145fd01 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = { 
.vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index a8ef3d747..45d92819d 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); + /* + * I/Os to distributed spares are directly dispatched to the dRAID + * leaf vdevs for aggregation. See the comment at the end of the + * zio_vdev_io_start() function. + */ + ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); + first = last = zio; if (zio->io_type == ZIO_TYPE_READ) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 47312e02f..989b90dc2 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> +#include <sys/vdev_draid.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -134,25 +135,51 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } -void -vdev_raidz_map_free(raidz_map_t *rm) +static void +vdev_raidz_row_free(raidz_row_t *rr) { int c; - for (c = 0; c < rm->rm_firstdatacol; c++) { - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - abd_free(rm->rm_col[c].rc_gdata); + if (rr->rr_col[c].rc_gdata != NULL) { + abd_free(rr->rr_col[c].rc_gdata); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_size != 0) { + if (abd_is_gang(rr->rr_col[c].rc_abd)) + abd_free(rr->rr_col[c].rc_abd); + else + abd_put(rr->rr_col[c].rc_abd); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } + } + + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - abd_put(rm->rm_col[c].rc_abd); + if (rr->rr_abd_empty != NULL) + abd_free(rr->rr_abd_empty); + + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); +} - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); +void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_row_free(rm->rm_row[i]); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rm, 
offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -161,10 +188,11 @@ vdev_raidz_map_free_vsd(zio_t *zio) raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -175,7 +203,7 @@ vdev_raidz_cksum_free(void *arg, size_t ignored) ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } @@ -186,77 +214,79 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) const size_t c = zcr->zcr_cbinfo; size_t x, offset; - const abd_t *good = NULL; - const abd_t *bad = rm->rm_col[c].rc_abd; - if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rr->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* - * Set up the rm_col[]s to generate the parity for + * Set up the rr_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_abd = - rm->rm_col[x].rc_gdata = - abd_alloc_sametype(rm->rm_col[x].rc_abd, - rm->rm_col[x].rc_size); + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); - rm->rm_col[x].rc_abd = + rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, - offset, rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + offset, rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } /* * Construct the parity from the good data. 
*/ - vdev_raidz_generate_parity(rm); + vdev_raidz_generate_parity_row(rm, rr); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset_size( - rm->rm_abd_copy, offset, - rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, - rm->rm_col[c].rc_size); + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; - for (x = rm->rm_firstdatacol; x < c; x++) - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < c; x++) + offset += rr->rr_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, - rm->rm_col[c].rc_size); + rr->rr_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ @@ -274,10 +304,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -287,8 +314,9 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); + ASSERT3U(rm->rm_nrows, ==, 1); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -299,26 +327,30 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) * Our parity data is already in separate buffers, so there's no need * to copy them. */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; - rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, - col->rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); - abd_copy(tmp, col->rc_abd, col->rc_size); + abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -337,7 +369,7 @@ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. 
*/ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -349,6 +381,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -370,8 +406,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ + /* + * acols: The columns that will be accessed. + * scols: The columns that will be accessed or skipped. + */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; @@ -383,65 +421,70 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); - - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rm->rm_row[0] = rr; + + rr->rr_cols = acols; + rr->rr_scols = scols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; +#ifdef ZFS_DEBUG + rr->rr_offset = zio->io_offset; + rr->rr_size = zio->io_size; +#endif asize = 0; for (c = 0; c < scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; + rc->rc_devidx = col; + rc->rc_offset = coff; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; if (c >= acols) - rm->rm_col[c].rc_size = 0; + rc->rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << ashift; + rc->rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << ashift; + rc->rc_size = q << ashift; - asize += rm->rm_col[c].rc_size; + asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); - rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); - ASSERT3U(rm->rm_nskip, <=, nparity); + rm->rm_skipstart = bc; - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rr->rr_col[c].rc_size); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = 
abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size); + off += rc->rc_size; } /* @@ -464,24 +507,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * skip the first column since at least one data and one parity * column must appear in each row. */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -550,50 +590,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); - for 
(i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -601,14 +634,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -616,33 +650,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; @@ -651,14 +681,15 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -671,27 +702,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * parity columns available. 
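/*
 * Illustrative sketch, not from the patch: VDEV_RAIDZ_64MUL_2() (and
 * VDEV_RAIDZ_64MUL_4(), which applies it twice) multiplies eight packed
 * GF(2^8) bytes by 2, which is how the Q and R parity are advanced past
 * the end of a short column as though the missing data were zero.
 * One byte at a time, with the RAID-Z field polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), the operation is:
 */
static inline uint8_t
gf256_mul2_sketch(uint8_t b)
{
	/* shift left by one; if the high bit fell off, reduce by 0x1d */
	return ((uint8_t)(b << 1) ^ ((b & 0x80) ? 0x1d : 0x00));
}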
*/ void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { + ASSERT3U(rr->rr_cols, !=, 0); + /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; - switch (rm->rm_firstdatacol) { + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_generate_parity_row(rm, rr); + } +} + /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) @@ -809,30 +851,27 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; if (c == x) continue; @@ -845,7 +884,7 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; @@ -853,44 +892,44 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - + rr->rr_col[x].rc_size - size); + } } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -901,10 +940,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -913,29 +952,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -953,7 +992,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -967,14 +1006,14 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1134,13 +1173,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1164,7 +1203,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1176,10 +1215,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, * correspond to data columns. 
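/*
 * Illustrative sketch, not from the patch: general multiplication in
 * GF(2^8) over the same 0x11d polynomial, written as a plain
 * shift-and-xor loop.  The reconstruction code above produces the same
 * result by working with the discrete-log tables vdev_raidz_log2[] and
 * vdev_raidz_pow2[], which is why expressions such as
 * vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)] appear there.
 */
static uint8_t
gf256_mul_sketch(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* addition in GF(2^8) is xor */
		b >>= 1;
		/* multiply a by x, reducing modulo the field polynomial */
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
	}
	return (p);
}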
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1196,8 +1235,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1258,7 +1297,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1290,22 +1329,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1334,16 +1375,14 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; size_t psize; - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; @@ -1354,30 +1393,39 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. + * temporary linear ABDs if any non-linear ABDs are found. 
*/ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + if (!abd_is_linear(rr->rr_col[i].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), + KM_PUSHPAGE); + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + + bufs[c] = col->rc_abd; + if (bufs[c] != NULL) { + col->rc_abd = abd_alloc_linear( + col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], + col->rc_size); + } + } - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + break; } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1387,7 +1435,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1422,9 +1470,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1437,18 +1485,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. 
*/ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1457,21 +1505,24 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } -int -vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +static int +vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, + const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1480,26 +1531,19 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; - /* - * The tgts list must already be sorted. - */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1514,7 +1558,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); @@ -1524,29 +1568,29 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1556,8 +1600,8 @@ static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t 
*vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1573,7 +1617,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1602,19 +1646,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, static void vdev_raidz_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1623,7 +1668,18 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +/* + * The allocatable space for a raidz vdev is N * sizeof(smallest child) + * so each child must provide at least 1/Nth of its asize. + */ +static uint64_t +vdev_raidz_min_asize(vdev_t *vd) +{ + return ((vd->vdev_min_asize + vd->vdev_children - 1) / + vd->vdev_children); +} + +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1634,21 +1690,21 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); + vdev_raidz_asize(vd, rr->rr_size); - raidz_col_t *rc = &rm->rm_col[col]; + raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - vdev_xlate(cvd, &logical_rs, &physical_rs); + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1666,106 +1722,82 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) #endif } -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. 
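/*
 * Illustrative sketch, not from the patch: the vdev_raidz_asize() math
 * above for a concrete case.  On a 10-wide raidz2 (cols = 10,
 * nparity = 2) with ashift = 12, a 128 KiB block needs 32 data sectors
 * plus 2 * ceil(32 / 8) = 8 parity sectors, i.e. 40 sectors before the
 * function's final padding of the total to a multiple of nparity + 1
 * sectors (42 here) and conversion back to bytes.
 */
static uint64_t
raidz_asize_sectors_sketch(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;	/* data sectors */

	/* nparity parity sectors per stripe of (cols - nparity) data sectors */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	return (asize);
}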
- */ static void -vdev_raidz_io_start(zio_t *zio) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; + raidz_map_t *rm = zio->io_vsd; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + vdev_raidz_generate_parity_row(rm, rr); - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + /* Verify physical to logical translation */ + vdev_raidz_io_verify(vd, rr, c); - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } - zio_execute(zio); - return; + /* + * Generate optional I/Os for skip sectors to improve aggregation + * contiguity. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rr->rr_scols); + if (c == rr->rr_scols) + c = 0; + + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } +} - ASSERT(zio->io_type == ZIO_TYPE_READ); +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. 
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1773,11 +1805,56 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } } +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, + vdrz->vd_logical_width, vdrz->vd_nparity); + + /* + * Until raidz expansion is implemented all maps for a raidz vdev + * contain a single row. + */ + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rr); + } zio_execute(zio); } - /* * Report a checksum error for a child of a RAID-Z device. */ @@ -1786,7 +1863,8 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; @@ -1827,13 +1905,14 @@ raidz_checksum_verify(zio_t *zio) * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. + * number of such failures. 
*/ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; + raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; @@ -1843,8 +1922,18 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + /* + * All data columns must have been successfully read in order + * to use them to generate parity columns for comparison. + */ + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + return (ret); + } + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; @@ -1852,12 +1941,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* + * Regenerates parity even for !tried||rc_error!=0 columns. This + * isn't harmful but it does have the side effect of fixing stuff + * we didn't realize was necessary (i.e. even if we return 0). + */ + vdev_raidz_generate_parity_row(rm, rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); @@ -1870,456 +1966,597 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) } static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +static void +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int curr, next, i, c, n; - int code, ret = 0; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; - ASSERT(total_errors < rm->rm_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } /* - * This simplifies one edge condition. + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. + * + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. 
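/*
 * Illustrative example, not from the patch: on a raidz2 read where a
 * single data column failed and was rebuilt from P, Q was read as well
 * (parity_errors + parity_untried = 0, which is less than
 * rr_firstdatacol - data_errors = 1), so the check below regenerates
 * the parity and compares it; any mismatching parity column has
 * rc_error set and is rewritten by the repair I/Os that follow.
 */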
*/ - tgts[-1] = -1; + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); + } - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. + * Use the good data we have in hand to repair damaged children. */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if ((rc->rc_error == 0 || rc->rc_size == 0) && + (rc->rc_repair == 0)) { + continue; } - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; } + } + } +} + +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) +{ + raidz_map_t *rm = zio->io_vsd; - tgts[i] = c++; + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (rc->rc_devidx == ltgts[lt]) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf( + rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= nparity) + dead_data++; + my_tgts[t++] = c; + break; + } + } + } + if (dead > nparity) { + /* reconstruction not possible */ + raidz_restore_orig_data(rm); + return (EINVAL); } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rm, rr, + my_tgts, t); + } - /* - * Setting tgts[n] simplifies the other edge condition. 
- */ - tgts[n] = rm->rm_cols; + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. + */ + if (rc->rc_error == 0 && + c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, + rc, rc->rc_gdata); + rc->rc_error = + SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); + vdev_raidz_io_done_verified(zio, rr); } - orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, - rm->rm_col[0].rc_size); + zio_checksum_verified(zio); - curr = 0; - next = tgts[curr]; + return (0); + } - while (curr != n) { - tgts[curr] = next; - curr = 0; + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + return (ECKSUM); +} - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy(orig[i], rc->rc_abd, rc->rc_size); - } +/* + * Iterate over all combinations of N bad vdevs and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. 
+ * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done + * + * This strategy works for dRAID but is less effecient when there are a large + * number of child vdevs and therefore permutations to check. Furthermore, + * since the raidz_map_t rows likely do not overlap reconstruction would be + * possible as long as there are no more than nparity data errors per row. + * These additional permutations are not currently checked but could be as + * a future improvement. + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + int nparity = vdev_get_nparity(zio->io_vd); + raidz_map_t *rm = zio->io_vsd; - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } + /* Check if there's enough data to attempt reconstrution. */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + int total_errors = 0; - ret = code; - goto done; - } + for (int c = 0; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_error) + total_errors++; + } - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy(rc->rc_abd, orig[i], rc->rc_size); - } + if (total_errors > nparity) + return (vdev_raidz_worst_error(rr)); + } - do { + for (int num_failures = 1; num_failures <= nparity; num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = zio->io_vd->vdev_children; + + ASSERT3U(num_failures, <=, nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + + /* Handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; + + for (;;) { + int err = raidz_reconstruct(zio, ltgts, num_failures, + nparity); + if (err == EINVAL) { /* - * Find the next valid column after the curr - * position.. 
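/*
 * Illustrative, self-contained demo, not from the patch: it walks the
 * same combination order as the target-advance loop below and, for
 * n = 6 children and k = 3 failures, prints exactly the STATE column of
 * the table above (0 1 2, 0 1 3, 0 2 3, 1 2 3, 0 1 4, ..., 3 4 5).
 */
#include <stdio.h>

static void
combrec_order_demo(int n, int k)
{
	int tgts[VDEV_RAIDZ_MAXPARITY + 1];

	for (int i = 0; i < k; i++)
		tgts[i] = i;	/* start from the lowest combination */
	tgts[k] = n;		/* sentinel, mirrors ltgts[num_failures] = n */

	for (;;) {
		for (int i = 0; i < k; i++)
			printf("%d ", tgts[i]);
		printf("\n");

		/* advance to the next combination, carrying as needed */
		for (int t = 0; ; t++) {
			tgts[t]++;
			if (tgts[t] == n || tgts[t] != tgts[t + 1])
				break;
			tgts[t] = t;	/* collided with the next slot: reset */
		}
		if (tgts[k - 1] == n)
			break;		/* last slot ran off the end: done */
	}
}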
+ * Reconstruction not possible with this # + * failures; try more failures. */ - for (next = tgts[curr] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; + break; + } else if (err == 0) + return (0); + + /* Compute next targets to try */ + for (int t = 0; ; t++) { + ASSERT3U(t, <, num_failures); + ltgts[t]++; + if (ltgts[t] == n) { + /* try more failures */ + ASSERT3U(t, ==, num_failures - 1); + break; + } - ASSERT(next <= tgts[curr + 1]); + ASSERT3U(ltgts[t], <, n); + ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. + * Try the next combination. */ - if (next != tgts[curr + 1]) + if (ltgts[t] != ltgts[t + 1]) break; /* - * Otherwise, find the next valid column after - * the previous position. + * Otherwise, reset this tgt to the minimum, + * and move on to the next tgt. */ - for (c = tgts[curr - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[curr] = c; - curr++; + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } - } while (curr != n); + /* Increase the number of failures and keep trying. */ + if (ltgts[num_failures - 1] == n) + break; } } - n--; -done: - for (i = 0; i < n; i++) - abd_free(orig[i]); - return (ret); + return (ECKSUM); +} + +void +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +{ + for (uint64_t row = 0; row < rm->rm_nrows; row++) { + raidz_row_t *rr = rm->rm_row[row]; + vdev_raidz_reconstruct_row(rm, rr, t, nt); + } } /* - * Complete an IO operation on a RAIDZ VDev + * Complete a write IO operation on a RAIDZ VDev * * Outline: - * - For write operations: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * Treat partial writes as a success. If we couldn't write enough + * columns to reconstruct the data, the I/O failed. Otherwise, + * good enough. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +/* + * return 0 if no reconstruction occurred, otherwise the "code" from + * vdev_raidz_reconstruct(). 
+ */ +static int +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, + raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc = NULL; - int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + int code = 0; - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; - if (!rc->rc_skipped) - unexpected_errors++; - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. 
- */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; + /* + * Identify the data columns that reported an error. + */ + int n = 0; + int tgts[VDEV_RAIDZ_MAXPARITY]; + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); + } - /* - * Identify the data columns that reported an error. - */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } + ASSERT(rr->rr_firstdatacol >= n); - ASSERT(rm->rm_firstdatacol >= n); + code = vdev_raidz_reconstruct_row(rm, rr, tgts, n); + } - code = vdev_raidz_reconstruct(rm, tgts, n); + return (code); +} - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } +/* + * Return the number of reads issued. + */ +static int +vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + int nread = 0; - goto done; - } - } - } + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. + * If this rows contains empty sectors which are not required + * for a normal read then allocate an ABD for them now so they + * may be read, verified, and any needed repairs performed. 
*/ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; + if (rr->rr_nempty && rr->rr_abd_empty == NULL) + vdev_draid_map_alloc_empty(zio, rr); - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) continue; - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + nread++; } + return (nread); +} - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); +/* + * We're here because either there were too many errors to even attempt + * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() + * failed. In either case, there is enough bad data to prevent reconstruction. + * Start checksum ereports for all children which haven't failed. + */ +static void +vdev_raidz_io_done_unrecoverable(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_first_datacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. 
- */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - vdev_t *cvd; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error != 0) - continue; + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = rm->rm_ecksuminjected; - - int ret = zfs_ereport_start_checksum( - zio->io_spa, cvd, &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - if (ret != EALREADY) { - mutex_enter(&cvd->vdev_stat_lock); - cvd->vdev_stat.vs_checksum_errors++; - mutex_exit(&cvd->vdev_stat_lock); - } + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error != 0) + continue; + + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum(zio->io_spa, + cvd, &zio->io_bookmark, zio, rc->rc_offset, + rc->rc_size, (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { + mutex_enter(&cvd->vdev_stat_lock); + cvd->vdev_stat.vs_checksum_errors++; + mutex_exit(&cvd->vdev_stat_lock); } } } +} -done: - zio_checksum_verified(zio); +void +vdev_raidz_io_done(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - /* - * Use the good data we have in hand to repair damaged children. - */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); + } + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + rr->rr_code = + vdev_raidz_io_done_reconstruct_known_missing(zio, + rm, rr); + } - if (rc->rc_error == 0) - continue; + if (raidz_checksum_verify(zio) == 0) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_done_verified(zio, rr); + } + zio_checksum_verified(zio); + } else { + /* + * A sequential resilver has no checksum which makes + * combinatoral reconstruction impossible. This code + * path is unreachable since raidz_checksum_verify() + * has no checksum to verify and must succeed. + */ + ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? - ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + /* + * This isn't a typical situation -- either we got a + * read error or a child silently returned bad data. + * Read every block so we can try again with as much + * data and parity as we can track down. If we've + * already been through once before, all children will + * be marked as tried so we'll proceed to combinatorial + * reconstruction. + */ + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. 
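The restructured read path above re-reads every untried column after a checksum failure and re-enters the done stage, and only when a later pass finds nothing left to read does it fall through to vdev_raidz_combrec(). A small standalone model of that control flow (illustrative only, not the kernel code):

```
/*
 * Model of the retry flow: read untried columns, re-enter the done
 * stage; when nothing new could be read, fall back to combrec.
 */
#include <stdbool.h>
#include <stdio.h>

#define	NCOLS	5

static int
read_untried(bool *tried, int ncols)
{
	int nread = 0;

	for (int c = 0; c < ncols; c++) {
		if (!tried[c]) {
			tried[c] = true;	/* issue the child read */
			nread++;
		}
	}
	return (nread);
}

int
main(void)
{
	bool tried[NCOLS] = { true, true, false, true, false };
	int pass = 1;

	for (;;) {
		/* Pretend raidz_checksum_verify() keeps failing. */
		int nread = read_untried(tried, NCOLS);

		if (nread != 0) {
			/* analogous to zio_vdev_io_redone() */
			printf("pass %d: re-read %d columns\n", pass++, nread);
			continue;
		}
		printf("pass %d: nothing left, try combrec\n", pass);
		break;
	}
	return (0);
}
```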
+ */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } + + zio->io_error = vdev_raidz_combrec(zio); + if (zio->io_error == ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + vdev_raidz_io_done_unrecoverable(zio); + } } } } @@ -2327,7 +2564,8 @@ done: static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted > vd->vdev_nparity) + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2343,18 +2581,26 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) * width blocks must be resilvered. */ static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; + uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; + /* Unreachable by sequential resilver. */ + ASSERT3U(phys_birth, !=, TXG_UNKNOWN); + + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + if (s + nparity >= dcols) return (B_TRUE); @@ -2375,7 +2621,8 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); @@ -2385,10 +2632,10 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + uint64_t b_start = logical_rs->rs_start >> ashift; + uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ @@ -2398,17 +2645,119 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; + physical_rs->rs_start = start_row << ashift; + physical_rs->rs_end = end_row << ashift; - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); + ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_rs->rs_end - logical_rs->rs_start); +} + +/* + * Initialize private RAIDZ specific fields from the nvlist. 
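The vdev_raidz_init() function that follows centralizes the nparity rules for RAID-Z: one to three parity devices, with double and triple parity gated on newer pool versions, and a single-parity default for very old pools that never stored ZPOOL_CONFIG_NPARITY. A compact sketch of just that validation, using booleans in place of the SPA version comparisons (an assumed simplification, not the kernel code):

```
/*
 * Sketch of the parity validation in vdev_raidz_init(); the version
 * checks are reduced to feature booleans for illustration.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define	MAXPARITY	3

static int
check_nparity(bool nparity_present, unsigned long nparity,
    bool pool_has_raidz2, bool pool_has_raidz3)
{
	if (!nparity_present) {
		/* Modern pools must record it; old pools default to 1. */
		return (pool_has_raidz2 ? EINVAL : 0);
	}
	if (nparity == 0 || nparity > MAXPARITY)
		return (EINVAL);
	if (nparity > 1 && !pool_has_raidz2)
		return (EINVAL);
	if (nparity > 2 && !pool_has_raidz3)
		return (EINVAL);
	return (0);
}

int
main(void)
{
	printf("%d\n", check_nparity(true, 2, true, false));	/* 0 */
	printf("%d\n", check_nparity(true, 3, true, false));	/* EINVAL */
	return (0);
}
```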
+ */ +static int +vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_raidz_t *vdrz; + uint64_t nparity; + + uint_t children; + nvlist_t **child; + int error = nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children); + if (error != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) + return (SET_ERROR(EINVAL)); + + /* + * Previous versions could only support 1 or 2 parity + * device. + */ + if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) + return (SET_ERROR(EINVAL)); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + + vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vd_logical_width = children; + vdrz->vd_nparity = nparity; + + *tsd = vdrz; + + return (0); +} + +static void +vdev_raidz_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); +} + +/* + * Add RAIDZ specific fields to the config nvlist. + */ +static void +vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); + vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vdrz->vd_nparity == 1 || + (vdrz->vd_nparity <= 2 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || + (vdrz->vd_nparity <= 3 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); + + /* + * Note that we'll add these even on storage pools where they + * aren't strictly required -- older software will just ignore + * it. 
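The vdev_raidz_nparity() and vdev_raidz_ndisks() callbacks added a little further below let generic code ask any top-level vdev for its parity and disk counts instead of comparing against &vdev_raidz_ops; the vdev_removal.c and vdev_trim.c hunks later in this diff rely on that through vdev_get_nparity() and vdev_get_ndisks(). A self-contained sketch of the assumed accessor shape (the real helpers are defined elsewhere in this commit):

```
/*
 * Assumed shape of the generic parity accessor enabled by the new
 * vdev_op_nparity callback: 0 for vdev types without parity.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct vdev vdev_t;
typedef struct vdev_ops {
	uint64_t (*vdev_op_nparity)(vdev_t *vd);
	const char *vdev_op_type;
} vdev_ops_t;

struct vdev {
	const vdev_ops_t *vdev_ops;
	uint64_t vdev_nparity;		/* stand-in for the per-vdev tsd */
};

static uint64_t
raidz_nparity(vdev_t *vd)
{
	return (vd->vdev_nparity);
}

static const vdev_ops_t raidz_ops = { raidz_nparity, "raidz" };
static const vdev_ops_t mirror_ops = { NULL, "mirror" };

static uint64_t
vdev_get_nparity(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_nparity != NULL)
		return (vd->vdev_ops->vdev_op_nparity(vd));
	return (0);
}

int
main(void)
{
	vdev_t rz = { &raidz_ops, 2 }, mir = { &mirror_ops, 0 };

	printf("%s: %llu\n", rz.vdev_ops->vdev_op_type,
	    (unsigned long long)vdev_get_nparity(&rz));
	printf("%s: %llu\n", mir.vdev_ops->vdev_op_type,
	    (unsigned long long)vdev_get_nparity(&mir));
	return (0);
}
```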
+ */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_nparity(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + return (vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { + .vdev_op_init = vdev_raidz_init, + .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, + .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, @@ -2417,6 +2766,11 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_raidz_config_generate, + .vdev_op_nparity = vdev_raidz_nparity, + .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 9595a7b95..a8eca06f9 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void) * Select parity generation method for raidz_map */ int -vdev_raidz_math_generate(raidz_map_t *rm) +vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr) { raidz_gen_f gen_parity = NULL; @@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm) if (gen_parity == NULL) return (RAIDZ_ORIGINAL_IMPL); - gen_parity(rm); + gen_parity(rr); return (0); } @@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, * @nbaddata - Number of failed data columns */ int -vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, - const int *dt, const int nbaddata) +vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, + const int *parity_valid, const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; @@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_fn(rm, dt)); + return (rec_fn(rr, dt)); } const char *raidz_gen_name[] = { diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 89c2082c4..35e016fc6 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -26,6 +26,7 @@ #define _VDEV_RAIDZ_MATH_IMPL_H #include <sys/types.h> +#include <sys/vdev_raidz_impl.h> #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline @@ -36,33 +37,33 @@ * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes * @coeff output array of coefficients. Array must be provided by * user and must hold minimum MUL_CNT values. 
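The coefficient helpers that follow (raidz_rec_q_coeff() and friends) now take a row rather than a map, but the math is unchanged: the Q coefficient is a power of two in GF(2^8) under the 0x11d reduction polynomial RAID-Z uses. A standalone illustration of that expression for a hypothetical row geometry; the gf_exp2() here is a naive stand-in for the table-driven kernel version:

```
/*
 * Compute 2^n in GF(2^8) with polynomial 0x11d and evaluate the same
 * expression as raidz_rec_q_coeff().  Illustrative only.
 */
#include <stdio.h>

static unsigned
gf_mul2(unsigned a)
{
	a <<= 1;
	if (a & 0x100)
		a ^= 0x11d;	/* x^8 + x^4 + x^3 + x^2 + 1 */
	return (a & 0xff);
}

static unsigned
gf_exp2(unsigned n)
{
	unsigned v = 1;

	while (n--)
		v = gf_mul2(v);
	return (v);
}

int
main(void)
{
	unsigned ncols = 10, x = 4;	/* hypothetical row geometry */

	/* Same expression as raidz_rec_q_coeff(): */
	printf("MUL_Q_X = 0x%02x\n", gf_exp2(255 - (ncols - x - 1)));
	/* Every nonzero element satisfies a^255 == 1 in GF(2^8). */
	printf("2^255   = 0x%02x\n", gf_exp2(255));
	return (0);
}
```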
*/ static noinline void -raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); } static noinline void -raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); } static noinline void -raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; @@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; const unsigned z = tgtidx[TARGET_Z]; @@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) /* * Generate P parity (RAIDZ1) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) +raidz_generate_p_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t psize = rm->rm_col[CODE_P].rc_size; - abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + const size_t ncols = rr->rr_cols; + const size_t psize = rr->rr_col[CODE_P].rc_size; + abd_t *pabd = rr->rr_col[CODE_P].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* start with first data column */ - raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); for (c = 2; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - size = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + size = rr->rr_col[c].rc_size; /* add data column */ raidz_add(pabd, dabd, size); @@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize, /* * Generate PQ parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) 
+raidz_generate_pq_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); for (c = 3; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, raidz_gen_pq_add); @@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* * Generate PQR parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) +raidz_generate_pqr_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); for (c = 4; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, raidz_gen_pqr_add); @@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm) * @syn_method raidz_add_abd() * @rec_method not applicable * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; size_t size; abd_t *dabd; + if (xabd == NULL) + return (1 << CODE_P); + raidz_math_begin(); /* copy P into target */ - raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); /* generate p_syndrome */ for (c = firstdc; c < ncols; c++) { if (c == x) continue; - dabd = rm->rm_col[c].rc_abd; - size = MIN(rm->rm_col[c].rc_size, xsize); + dabd = rr->rr_col[c].rc_abd; + size = MIN(rr->rr_col[c].rc_size, xsize); raidz_add(xabd, dabd, size); } @@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - 
* @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - abd_t *xabd = rm->rm_col[x].rc_abd; - const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_Q); + unsigned coeff[MUL_CNT]; - raidz_rec_q_coeff(rm, tgtidx, coeff); + raidz_rec_q_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) } /* add Q to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ rr * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_R); + unsigned coeff[MUL_CNT]; - raidz_rec_r_coeff(rm, tgtidx, coeff); + raidz_rec_r_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) } /* add R to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, * @syn_method 
raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q)); + unsigned coeff[MUL_CNT]; - raidz_rec_pq_coeff(rm, tgtidx, coeff); + raidz_rec_pq_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[0]; const size_t y = tgtidx[1]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, 
coeff); + raidz_rec_pr_coeff(rr, tgtidx, coeff); /* * Check if some of targets are shorter then others. @@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); - return ((1 << CODE_P) | (1 << CODE_Q)); + return ((1 << CODE_P) | (1 << CODE_R)); } @@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + raidz_rec_qr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); 
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pqr_abd() * @rec_method raidz_rec_pqr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - const size_t zsize = rm->rm_col[z].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; - abd_t *zabd = rm->rm_col[z].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + const size_t zsize = rr->rr_col[z].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; + abd_t *zabd = rr->rr_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + raidz_rec_pqr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, @@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); if (zsize < xsize) - raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); raidz_math_end(); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 3362d608c..784d1af15 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -25,6 +25,7 @@ */ #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/dsl_scan.h> #include <sys/spa_impl.h> #include <sys/metaslab_impl.h> @@ -63,13 +64,15 @@ * * Limitations: * - * - Only supported for mirror vdev types. Due to the variable stripe - * width used by raidz sequential reconstruction is not possible. 
+ * - Sequential reconstruction is not possible on RAIDZ due to its + * variable stripe width. Note dRAID uses a fixed stripe width which + * avoids this issue, but comes at the expense of some usable capacity. * - * - Block checksums are not verified during sequential reconstuction. + * - Block checksums are not verified during sequential reconstruction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the - * last active resilver completes the pool is automatically scrubbed. + * last active resilver completes the pool is automatically scrubbed + * by default. * * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver @@ -77,8 +80,8 @@ * * Advantages: * - * - Sequential reconstuction is performed in LBA order which may be faster - * than healing reconstuction particularly when using using HDDs (or + * - Sequential reconstruction is performed in LBA order which may be faster + * than healing reconstruction particularly when using using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. @@ -86,9 +89,9 @@ * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, - * sequential reconstruction is handled by the top-level mirror vdevs. - * This allows for it to be started or canceled on a top-level vdev - * without impacting any other top-level vdevs in the pool. + * sequential reconstruction is handled by the top-level vdevs. This + * allows for it to be started or canceled on a top-level vdev without + * impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a @@ -97,17 +100,35 @@ /* - * Maximum number of queued rebuild I/Os top-level vdev. The number of - * concurrent rebuild I/Os issued to the device is controlled by the - * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module - * options. + * Size of rebuild reads; defaults to 1MiB per data disk and is capped at + * SPA_MAXBLOCKSIZE. */ -unsigned int zfs_rebuild_queue_limit = 20; +unsigned long zfs_rebuild_max_segment = 1024 * 1024; /* - * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + * Maximum number of parallelly executed bytes per leaf vdev caused by a + * sequential resilver. We attempt to strike a balance here between keeping + * the vdev queues full of I/Os at all times and not overflowing the queues + * to cause long latency, which would cause long txg sync times. + * + * A large default value can be safely used here because the default target + * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep + * the queue depth short. + * + * 32MB was selected as the default value to achieve good performance with + * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential + * rebuild was unable to saturate all of the drives using smaller values. + * With a value of 32MB the sequential resilver write rate was measured at + * 800MB/s sustained while rebuilding to a distributed spare. 
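The per-leaf budget described above becomes a per-rebuild cap by multiplying it by the number of children in the top-level vdev, with a 1 MiB floor, as vdev_rebuild_thread() does further down in this file. A quick numeric illustration using a hypothetical 90-child dRAID top-level vdev:

```
/*
 * Illustration of vr_bytes_inflight_max:
 * MAX(1 MiB, zfs_rebuild_vdev_limit * vdev_children).
 */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t rebuild_vdev_limit = 32ULL << 20;	/* 32 MiB default */
	uint64_t children = 90;		/* e.g. a draid2:8d:90c:2s vdev */
	uint64_t floor = 1ULL << 20;

	uint64_t inflight_max = rebuild_vdev_limit * children;
	if (inflight_max < floor)
		inflight_max = floor;

	printf("in-flight cap: %" PRIu64 " MiB\n", inflight_max >> 20);
	return (0);
}
```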
*/ -unsigned long zfs_rebuild_max_segment = 1024 * 1024; +unsigned long zfs_rebuild_vdev_limit = 32 << 20; + +/* + * Automatically start a pool scrub when the last active sequential resilver + * completes in order to verify the checksums of all blocks which have been + * resilvered. This option is enabled by default and is strongly recommended. + */ +int zfs_rebuild_scrub_enabled = 1; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). @@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); - vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, @@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); - spa_notify_waiters(spa); + /* + * While we're in syncing context take the opportunity to + * setup the scrub when there are no more active rebuilds. + */ + if (!vdev_rebuild_active(spa->spa_root_vdev) && + zfs_rebuild_scrub_enabled) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_setup_sync(&func, tx); + } + cv_broadcast(&vd->vdev_rebuild_cv); } @@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; - mutex_enter(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. @@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio) abd_free(zio->io_abd); - ASSERT3U(vd->vdev_rebuild_inflight, >, 0); - vd->vdev_rebuild_inflight--; - cv_broadcast(&vd->vdev_rebuild_io_cv); - mutex_exit(&vd->vdev_rebuild_io_lock); + ASSERT3U(vr->vr_bytes_inflight, >, 0); + vr->vr_bytes_inflight -= zio->io_size; + cv_broadcast(&vr->vr_io_cv); + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* - * Rebuild the data in this range by constructing a special dummy block - * pointer for the given range. It has no relation to any existing blocks - * in the pool. But by disabling checksum verification and issuing a scrub - * I/O mirrored vdevs will replicate the block using any available mirror - * leaf vdevs. + * Initialize a block pointer that can be used to read the given segment + * for sequential rebuild. */ static void -vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, - uint64_t txg) +vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, + uint64_t asize) { - vdev_t *vd = vr->vr_top_vdev; - spa_t *spa = vd->vdev_spa; - uint64_t psize = asize; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - blkptr_t blk, *bp = &blk; + uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? + vdev_draid_asize_to_psize(vd, asize) : asize; + BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); @@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - /* - * We increment the issued bytes by the asize rather than the psize - * so the scanned and issued bytes may be directly compared. 
This - * is consistent with the scrub/resilver issued reporting. - */ - vr->vr_pass_bytes_issued += asize; - vr->vr_rebuild_phys.vrp_bytes_issued += asize; - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, - abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, - ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | - ZIO_FLAG_RESILVER, NULL)); } /* @@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; + blkptr_t blk; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); @@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; - mutex_enter(&vd->vdev_rebuild_io_lock); + /* + * Rebuild the data in this range by constructing a special block + * pointer. It has no relation to any existing blocks in the pool. + * However, by disabling checksum verification and issuing a scrub IO + * we can reconstruct and repair any children with missing data. + */ + vdev_rebuild_blkptr_init(&blk, vd, start, size); + uint64_t psize = BP_GET_PSIZE(&blk); + + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + return (0); + + mutex_enter(&vr->vr_io_lock); /* Limit in flight rebuild I/Os */ - while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - vd->vdev_rebuild_inflight++; - mutex_exit(&vd->vdev_rebuild_io_lock); + vr->vr_bytes_inflight += psize; + mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -558,46 +584,30 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { - mutex_enter(&vd->vdev_rebuild_io_lock); - vd->vdev_rebuild_inflight--; - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= psize; + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); vr->vr_scan_offset[txg & TXG_MASK] = start + size; - vdev_rebuild_rebuild_block(vr, start, size, txg); + vr->vr_pass_bytes_issued += size; + vr->vr_rebuild_phys.vrp_bytes_issued += size; - dmu_tx_commit(tx); + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); return (0); } /* - * Split range into legally-sized logical chunks given the constraints of the - * top-level mirror vdev type. 
- */ -static uint64_t -vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) -{ - uint64_t chunk_size, max_asize, max_segment; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - - max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, - 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); - max_asize = vdev_psize_to_asize(vd, max_segment); - chunk_size = MIN(size, max_asize); - - return (chunk_size); -} - -/* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. */ static int @@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr) while (size > 0) { uint64_t chunk_size; - chunk_size = vdev_rebuild_chunk_size(vd, start, size); + /* + * Split range into legally-sized logical chunks + * given the constraints of the top-level vdev + * being rebuilt (dRAID or mirror). + */ + ASSERT3P(vd->vdev_ops, !=, NULL); + chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, + start, size, zfs_rebuild_max_segment); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) @@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg) vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); + vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_bytes_inflight_max = MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children); + uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg) ASSERT0(range_tree_space(vr->vr_scan_tree)); - /* - * Disable any new allocations to this metaslab and wait - * for any writes inflight to complete. This is needed to - * ensure all allocated ranges are rebuilt. - */ + /* Disable any new allocations to this metaslab */ metaslab_disable(msp); spa_config_exit(spa, SCL_CONFIG, FTAG); - txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* + * If there are outstanding allocations wait for them to be + * synced. This is needed to ensure all allocated ranges are + * on disk and therefore will be rebuilt. + */ + for (int j = 0; j < TXG_SIZE; j++) { + if (range_tree_space(msp->ms_allocating[j])) { + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + txg_wait_synced(dsl, 0); + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + break; + } + } + + /* * When a metaslab has been allocated from read its allocated - * ranges from the space map object in to the vr_scan_tree. + * ranges from the space map object into the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ @@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg) /* * To provide an accurate estimate re-calculate the estimated * size every 5 minutes to account for recent allocations and - * frees made space maps which have not yet been rebuilt. + * frees made to space maps which have not yet been rebuilt. 
*/ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); @@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ - mutex_enter(&vd->vdev_rebuild_io_lock); - while (vd->vdev_rebuild_inflight > 0) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + while (vr->vr_bytes_inflight > 0) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_exit(&vr->vr_io_lock); + + mutex_destroy(&vr->vr_io_lock); + cv_destroy(&vr->vr_io_cv); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, - "Max segment size in bytes of rebuild reads"); + "Max segment size in bytes of rebuild reads"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, + "Max bytes in flight per leaf vdev for sequential resilvers"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, + "Automatically scrub after sequential resilver completes"); /* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index ed7d1d4b3..4606af9aa 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); @@ -1120,7 +1120,7 @@ static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; @@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd) /* * All vdevs in normal class must have the same ashift - * and not be raidz. + * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; @@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd) num_indirect++; if (!vdev_is_concrete(cvd)) continue; - if (cvd->vdev_ops == &vdev_raidz_ops) + if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only @@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. 
*/ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - - vd_type = VDEV_TYPE_SPARE; - vd_path = spa_strdup(fnvlist_lookup_string( - nv, ZPOOL_CONFIG_PATH)); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + char *type; + boolean_t draid_spare = B_FALSE; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, + guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + vd_type = VDEV_TYPE_SPARE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 9e8aac7d0..45ddc2f71 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_root_open, .vdev_op_close = vdev_root_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ .vdev_op_state_change = vdev_root_state_change, @@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 02b42ddd5..895957bda 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_secure = secure; } - boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); + vdev_trim_state_t old_state = vd->vdev_trim_state; + boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: - spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); - spa_history_log_internal(spa, "trim", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_TRIM_ACTIVE || + old_state == VDEV_TRIM_SUSPENDED) { + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s canceled", vd->vdev_path); + } break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); @@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta) return (0); } +static void +vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + 
*last_rs_end = physical_rs->rs_end; +} + +static void +vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_trim_bytes_est += size; + + if (vd->vdev_trim_last_offset >= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += size; + } else if (vd->vdev_trim_last_offset > physical_rs->rs_start && + vd->vdev_trim_last_offset <= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += + vd->vdev_trim_last_offset - physical_rs->rs_start; + } +} + /* * Calculates the completion percentage of a manual TRIM. */ @@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been trimmed. */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been trimmed */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_trim_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_trim_last_offset > last_rs_end) { vd->vdev_trim_bytes_done += ms_free; vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd) rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_trim_bytes_est += size; - if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { - vd->vdev_trim_bytes_done += size; - } else if (vd->vdev_trim_last_offset > - physical_rs.rs_start && - vd->vdev_trim_last_offset <= - physical_rs.rs_end) { - vd->vdev_trim_bytes_done += - vd->vdev_trim_last_offset - - physical_rs.rs_start; - } + + vdev_xlate_walk(vd, &logical_rs, + vdev_trim_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd) return (err); } +static void +vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + trim_args_t *ta = arg; + vdev_t *vd = ta->trim_vdev; + + /* + * Only a manual trim will be traversing the vdev sequentially. + * For an auto trim all valid ranges should be added. + */ + if (ta->trim_type == TRIM_TYPE_MANUAL) { + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_trim_last_offset) + return; + + /* Pick up where we left off mid-range. 
@@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
 	return (err);
 }
 
+static void
+vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+	trim_args_t *ta = arg;
+	vdev_t *vd = ta->trim_vdev;
+
+	/*
+	 * Only a manual trim will be traversing the vdev sequentially.
+	 * For an auto trim all valid ranges should be added.
+	 */
+	if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+		/* Only add segments that we have not visited yet */
+		if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
+			return;
+
+		/* Pick up where we left off mid-range. */
+		if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
+			ASSERT3U(physical_rs->rs_end, >,
+			    vd->vdev_trim_last_offset);
+			physical_rs->rs_start = vd->vdev_trim_last_offset;
+		}
+	}
+
+	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+	range_tree_add(ta->trim_tree, physical_rs->rs_start,
+	    physical_rs->rs_end - physical_rs->rs_start);
+}
+
 /*
- * Convert the logical range into a physical range and add it to the
+ * Convert the logical range into physical ranges and add them to the
  * range tree passed in the trim_args_t.
  */
 static void
@@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	trim_args_t *ta = arg;
 	vdev_t *vd = ta->trim_vdev;
-	range_seg64_t logical_rs, physical_rs;
+	range_seg64_t logical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;
 
@@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
 	}
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
-	vdev_xlate(vd, &logical_rs, &physical_rs);
-
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_start == physical_rs.rs_start);
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_end == physical_rs.rs_end);
-
-	/*
-	 * Only a manual trim will be traversing the vdev sequentially.
-	 * For an auto trim all valid ranges should be added.
-	 */
-	if (ta->trim_type == TRIM_TYPE_MANUAL) {
-
-		/* Only add segments that we have not visited yet */
-		if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
-			return;
-
-		/* Pick up where we left off mid-range. */
-		if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
-			ASSERT3U(physical_rs.rs_end, >,
-			    vd->vdev_trim_last_offset);
-			physical_rs.rs_start = vd->vdev_trim_last_offset;
-		}
-	}
-
-	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
-
-	/*
-	 * With raidz, it's possible that the logical range does not live on
-	 * this leaf vdev. We only add the physical range to this vdev's if it
-	 * has a length greater than 0.
-	 */
-	if (physical_rs.rs_end > physical_rs.rs_start) {
-		range_tree_add(ta->trim_tree, physical_rs.rs_start,
-		    physical_rs.rs_end - physical_rs.rs_start);
-	} else {
-		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
-	}
+	vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
 }
 
 /*
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index a8341f50b..ea71ef325 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 		bcopy(info, report->zcr_ckinfo, sizeof (*info));
 	}
 
-	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+	report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
+	report->zcr_align =
+	    vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
 	report->zcr_length = length;
 
 #ifdef _KERNEL
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ccba6cea3..982940dbd 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio)
 			return (zio);
 		} else {
 			/*
-			 * Round up compressed size up to the ashift
-			 * of the smallest-ashift device, and zero the tail.
-			 * This ensures that the compressed size of the BP
-			 * (and thus compressratio property) are correct,
+			 * Round compressed size up to the minimum allocation
+			 * size of the smallest-ashift device, and zero the
+			 * tail. This ensures that the compressed size of the
+			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
-			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
-			size_t rounded = (size_t)P2ROUNDUP(psize,
-			    1ULL << spa->spa_min_ashift);
+			ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
+			size_t rounded = (size_t)roundup(psize,
+			    spa->spa_min_alloc);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
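Editor's note: zio_write_compress() now rounds the compressed size up to spa_min_alloc rather than to 1 << spa_min_ashift, presumably because a dRAID vdev's minimum allocation can be larger than a single sector, and it abandons compression whenever the padded size would not beat the logical size. The switch from P2ROUNDUP() to roundup() also suggests the minimum allocation size is not necessarily a power of two. The sketch below works through that decision with made-up sizes; the macro and helper are illustrative, not the kernel definitions.

```
/*
 * Illustrative only: round the compressed size up to the pool's minimum
 * allocation unit and fall back to the uncompressed size when padding
 * erases the savings.  All values below are made up.
 */
#include <stddef.h>
#include <stdio.h>

#define	ROUNDUP(x, align)	((((x) + (align) - 1) / (align)) * (align))

static size_t
effective_psize(size_t psize, size_t lsize, size_t min_alloc)
{
	size_t rounded = ROUNDUP(psize, min_alloc);

	/* If padding eats the savings, store the block uncompressed. */
	return (rounded >= lsize ? lsize : rounded);
}

int
main(void)
{
	/* 16 KiB block compressed to 9 KiB. */
	printf("%zu\n", effective_psize(9216, 16384, 4096));	/* 12288: kept */
	printf("%zu\n", effective_psize(9216, 16384, 16384));	/* 16384: off */
	return (0);
}
```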
@@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio)
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them. The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
+	 *
+	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
+	 * a dRAID spare vdev. For example, when a dRAID spare is first
+	 * used, its spare blocks need to be written to but the leaf vdev's
+	 * of such blocks can have empty DTL_PARTIAL.
+	 *
+	 * There seemed no clean way to allow such writes while bypassing
+	 * spurious ones. At this point, just avoid all bypassing for dRAID
+	 * for correctness.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
+	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
-	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
+	/*
+	 * Select the next best leaf I/O to process.  Distributed spares are
+	 * excluded since they dispatch the I/O directly to a leaf vdev after
+	 * applying the dRAID mapping.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf &&
+	    vd->vdev_ops != &vdev_draid_spare_ops &&
+	    (zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_TRIM)) {
 
 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
 			return (zio);
@@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio)
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
-	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
-
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		vdev_queue_io_done(zio);
 
 		if (zio->io_type == ZIO_TYPE_WRITE)
@@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio)
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
-		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index fb8ce0916..e56ea8868 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
 	if (zio->io_type != ZIO_TYPE_READ)
 		return (0);
 
+	/*
+	 * A rebuild I/O has no checksum to verify.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
+		return (0);
+
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;