diff options
author | Brian Behlendorf <[email protected]> | 2020-11-13 13:51:51 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2020-11-13 13:51:51 -0800 |
commit | b2255edcc0099e62ad46a3dd9d64537663c6aee3 (patch) | |
tree | 6cfe0d0fd30fb451396551a991d50f4bdc0cf353 /include/sys | |
parent | a724db03740133c46b9a577b41a6f7221acd3e1f (diff) |
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10102
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/Makefile.am | 1 | ||||
-rw-r--r-- | include/sys/dsl_scan.h | 1 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 14 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 5 | ||||
-rw-r--r-- | include/sys/txg.h | 1 | ||||
-rw-r--r-- | include/sys/vdev.h | 16 | ||||
-rw-r--r-- | include/sys/vdev_draid.h | 110 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 52 | ||||
-rw-r--r-- | include/sys/vdev_raidz.h | 17 | ||||
-rw-r--r-- | include/sys/vdev_raidz_impl.h | 61 | ||||
-rw-r--r-- | include/sys/vdev_rebuild.h | 4 | ||||
-rw-r--r-- | include/sys/zio.h | 1 |
12 files changed, 237 insertions, 46 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index cfc3d1018..c3ebf17b5 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -82,6 +82,7 @@ COMMON_H = \ vdev_disk.h \ vdev_file.h \ vdev.h \ + vdev_draid.h \ vdev_impl.h \ vdev_indirect_births.h \ vdev_indirect_mapping.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 8f929207d..19c3dd599 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -163,6 +163,7 @@ typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; void scan_init(void); void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_setup_sync(void *, dmu_tx_t *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 211dd6d50..5bb7971d4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -617,6 +617,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" @@ -757,10 +758,17 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" +/* dRAID configuration */ +#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" +#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" +#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -770,6 +778,12 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_DRAID_MAXPARITY 3 +#define VDEV_DRAID_MIN_CHILDREN 2 +#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX + /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 69de75fb6..93f49a311 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -240,8 +240,9 @@ struct spa { kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_ashift; /* of vdevs in normal class */ + uint64_t spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/include/sys/txg.h b/include/sys/txg.h index 260a3b43c..22158bd1a 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -41,6 +41,7 @@ extern "C" { #define TXG_MASK (TXG_SIZE - 1) /* mask for size */ #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +#define TXG_UNKNOWN 0 /* Number of txgs worth of frees we defer adding to in-core spacemaps */ #define TXG_DEFER_SIZE 2 diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 309ce33be..7bc72a03d 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -49,10 +49,13 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; +typedef boolean_t vdev_open_children_func_t(vdev_t *vd); + extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); +extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); @@ -71,7 +74,10 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); @@ -97,8 +103,14 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); + +typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); + +extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs); + range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h new file mode 100644 index 000000000..65417a93c --- /dev/null +++ b/include/sys/vdev_draid.h @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_DRAID_H +#define _SYS_VDEV_DRAID_H + +#include <sys/types.h> +#include <sys/abd.h> +#include <sys/nvpair.h> +#include <sys/zio.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_raidz_impl.h> +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Constants required to generate and use dRAID permutations. + */ +#define VDEV_DRAID_SEED 0xd7a1d5eed +#define VDEV_DRAID_MAX_MAPS 254 +#define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT +#define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT) +#define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT) + +/* + * dRAID permutation map. + */ +typedef struct draid_map { + uint64_t dm_children; /* # of permuation columns */ + uint64_t dm_nperms; /* # of permutation rows */ + uint64_t dm_seed; /* dRAID map seed */ + uint64_t dm_checksum; /* Checksum of generated map */ + uint8_t *dm_perms; /* base permutation array */ +} draid_map_t; + +/* + * dRAID configuration. + */ +typedef struct vdev_draid_config { + /* + * Values read from the dRAID nvlist configuration. + */ + uint64_t vdc_ndata; /* # of data devices in group */ + uint64_t vdc_nparity; /* # of parity devices in group */ + uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_children; /* # of children */ + uint64_t vdc_ngroups; /* # groups per slice */ + + /* + * Immutable derived constants. + */ + uint8_t *vdc_perms; /* permutation array */ + uint64_t vdc_nperms; /* # of permutations */ + uint64_t vdc_groupwidth; /* = data + parity */ + uint64_t vdc_ndisks; /* = children - spares */ + uint64_t vdc_groupsz; /* = groupwidth * DRAID_ROWSIZE */ + uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */ +} vdev_draid_config_t; + +/* + * Functions for handling dRAID permutation maps. + */ +extern uint64_t vdev_draid_rand(uint64_t *); +extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **); +extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); + +/* + * General dRAID support functions. + */ +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); +extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); + +/* Functions for dRAID distributed spares. */ +extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DRAID_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3c4c3fb5a..7d2b2743c 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -68,14 +68,19 @@ extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ +typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -87,13 +92,24 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, - range_seg64_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, + range_seg64_t *physical, range_seg64_t *remain); +typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, + uint64_t size, uint64_t max_segment); +typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, + uint64_t *sizep); +typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); +typedef uint64_t vdev_nparity_func_t(vdev_t *vd); +typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { + vdev_init_func_t *vdev_op_init; + vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; + vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; @@ -101,11 +117,12 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. - */ vdev_xlation_func_t *vdev_op_xlate; + vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; + vdev_metaslab_init_func_t *vdev_op_metaslab_init; + vdev_config_generate_func_t *vdev_op_config_generate; + vdev_nparity_func_t *vdev_op_nparity; + vdev_ndisks_func_t *vdev_op_ndisks; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -325,16 +342,13 @@ struct vdev { kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; - /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ + /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; - kmutex_t vdev_rebuild_io_lock; - kcondvar_t vdev_rebuild_io_cv; - uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -392,7 +406,6 @@ struct vdev { uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ @@ -445,8 +458,6 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; }; -#define VDEV_RAIDZ_MAXPARITY 3 - #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 @@ -532,6 +543,9 @@ typedef struct vdev_label { #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS +#define VDEV_OFFSET_IS_LABEL(vd, off) \ + (((off) < VDEV_LABEL_START_SIZE) || \ + ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 @@ -577,6 +591,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -587,11 +603,15 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, - range_seg64_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); +extern uint64_t vdev_get_nparity(vdev_t *vd); +extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 0ce2b5ea1..029fdef5f 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -32,6 +32,7 @@ extern "C" { #endif struct zio; +struct raidz_row; struct raidz_map; #if !defined(_KERNEL) struct kernel_param {}; @@ -43,8 +44,11 @@ struct kernel_param {}; struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); -int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_child_done(zio_t *); +void vdev_raidz_io_done(zio_t *); /* * vdev_raidz_math interface @@ -52,11 +56,16 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); -int vdev_raidz_math_generate(struct raidz_map *); -int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, - const int); +int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *); +int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, + const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz { + int vd_logical_width; + int vd_nparity; +} vdev_raidz_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 8492daedb..38d4f9e0b 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include <sys/debug.h> #include <sys/kstat.h> #include <sys/abd.h> +#include <sys/vdev_impl.h> #ifdef __cplusplus extern "C" { @@ -106,30 +107,45 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ + void *rc_orig_data; /* pre-reconstruction */ + abd_t *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ + uint8_t rc_repair; /* Write good data to this column */ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_scols; /* Count including skipped columns */ + uint64_t rr_bigcols; /* Remainder data column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rr_abd_empty; /* dRAID empty sector buffer */ + int rr_nempty; /* empty sectors included in parity */ + int rr_code; /* reconstruction code (unused) */ +#ifdef ZFS_DEBUG + uint64_t rr_offset; /* Logical offset for *_io_verify() */ + uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ + boolean_t rm_freed; /* map no longer has referencing ZIO */ + boolean_t rm_ecksuminjected; /* checksum error was injected */ + int rm_nrows; /* Regular row count */ + int rm_nskip; /* RAIDZ sectors skipped for padding */ + int rm_skipstart; /* Column index of padding start */ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; @@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; * * raidz_parity Returns parity of the RAIDZ block * raidz_ncols Returns number of columns the block spans + * Note, all rows have the same number of columns. * raidz_nbigcols Returns number of big columns * raidz_col_p Returns pointer to a column * raidz_col_size Returns size of a column * raidz_big_size Returns size of big columns * raidz_short_size Returns size of short columns */ -#define raidz_parity(rm) ((rm)->rm_firstdatacol) -#define raidz_ncols(rm) ((rm)->rm_cols) +#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol) +#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols) #define raidz_nbigcols(rm) ((rm)->rm_bigcols) #define raidz_col_p(rm, c) ((rm)->rm_col + (c)) #define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) @@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; */ #define _RAIDZ_GEN_WRAP(code, impl) \ static void \ -impl ## _gen_ ## code(void *rmp) \ +impl ## _gen_ ## code(void *rrp) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - raidz_generate_## code ## _impl(rm); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + raidz_generate_## code ## _impl(rr); \ } /* @@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \ */ #define _RAIDZ_REC_WRAP(code, impl) \ static int \ -impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ +impl ## _rec_ ## code(void *rrp, const int *tgtidx) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \ } /* diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 3d4b8cc46..61ae15c5d 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -66,10 +66,14 @@ typedef struct vdev_rebuild { vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + kmutex_t vr_io_lock; /* inflight IO lock */ + kcondvar_t vr_io_cv; /* inflight IO cv */ /* In-core state and progress */ uint64_t vr_scan_offset[TXG_SIZE]; uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + uint64_t vr_bytes_inflight_max; /* maximum bytes inflight */ + uint64_t vr_bytes_inflight; /* current bytes inflight */ /* Per-rebuild pass statistics for calculating bandwidth */ uint64_t vr_pass_start_time; diff --git a/include/sys/zio.h b/include/sys/zio.h index 495983171..334ca064b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -372,6 +372,7 @@ struct zio_cksum_report { nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; |