diff options
author | Brian Behlendorf <[email protected]> | 2020-07-03 11:05:50 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2020-07-03 11:05:50 -0700 |
commit | 9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b (patch) | |
tree | 715c2fa00e55762764cadef8460da09f919910ad /include | |
parent | 7ddb753d17f2c12f152647c0e34eb9c42ee5e4af (diff) |
Add device rebuild feature
The device_rebuild feature enables sequential reconstruction when
resilvering. Mirror vdevs can be rebuilt in LBA order which may
more quickly restore redundancy depending on the pool's average block
size, overall fragmentation and the performance characteristics
of the devices. However, block checksums cannot be verified
as part of the rebuild, so a scrub is automatically started after
the sequential resilver completes.
The new '-s' option has been added to the `zpool attach` and
`zpool replace` commands to request sequential reconstruction
instead of healing reconstruction when resilvering.
zpool attach -s <pool> <existing vdev> <new vdev>
zpool replace -s <pool> <old vdev> <new vdev>
The `zpool status` output has been updated to report the progress
of sequential resilvering in the same way as healing resilvering.
The one notable difference is that multiple sequential resilvers
may be in progress as long as they're operating on different
top-level vdevs.
The `zpool wait -t resilver` command was extended to wait on
sequential resilvers. From this perspective they are no different
than healing resilvers.
Sequential resilvers cannot be supported for RAIDZ, but are
compatible with the dRAID feature being developed.
As part of this change the resilver_restart_* tests were moved
into the functional/replacement directory. Additionally, the
replacement tests were renamed and extended to verify both
resilvering and rebuilding.
Original-patch-by: Isaac Huang <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: John Poduska <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10349
Diffstat (limited to 'include')
-rw-r--r-- | include/libzfs.h | 7 | ||||
-rw-r--r-- | include/sys/Makefile.am | 1 | ||||
-rw-r--r-- | include/sys/dsl_scan.h | 2 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 38 | ||||
-rw-r--r-- | include/sys/spa.h | 10 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 1 | ||||
-rw-r--r-- | include/sys/vdev.h | 2 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 17 | ||||
-rw-r--r-- | include/sys/vdev_rebuild.h | 97 | ||||
-rw-r--r-- | include/sys/zio_priority.h | 1 | ||||
-rw-r--r-- | include/zfeature_common.h | 1 |
11 files changed, 165 insertions, 12 deletions
diff --git a/include/libzfs.h b/include/libzfs.h index 64a0a2035..873e8f304 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -79,7 +79,7 @@ typedef enum zfs_error { EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ - EZFS_RESILVERING, /* currently resilvering */ + EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ @@ -148,6 +148,7 @@ typedef enum zfs_error { EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ + EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_UNKNOWN } zfs_error_t; @@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, - const char *, nvlist_t *, int); + const char *, nvlist_t *, int, boolean_t); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_remove_cancel(zpool_handle_t *); @@ -387,6 +388,8 @@ typedef enum { ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ + ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ /* * Finally, the following indicates a healthy pool. 
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index ce781aa4c..0659c6419 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -89,6 +89,7 @@ COMMON_H = \ vdev_initialize.h \ vdev_raidz.h \ vdev_raidz_impl.h \ + vdev_rebuild.h \ vdev_removal.h \ vdev_trim.h \ xvattr.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index bcb896da3..8f929207d 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -42,6 +42,8 @@ struct dsl_dataset; struct dsl_pool; struct dmu_tx; +extern int zfs_scan_suspend_progress; + /* * All members of this structure must be uint64_t, for byteswap * purposes. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 575a4af51..1bfd7a485 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -704,6 +704,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ @@ -730,6 +731,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ +#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" /* * The persistent vdev state is stored as separate values rather than a single @@ -778,6 +780,9 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" +#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ + "org.openzfs:vdev_rebuild" + #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" @@ -991,6 +996,21 @@ typedef enum dsl_scan_state { DSS_NUM_STATES } dsl_scan_state_t; 
+typedef struct vdev_rebuild_stat { + uint64_t vrs_state; /* vdev_rebuild_state_t */ + uint64_t vrs_start_time; /* time_t */ + uint64_t vrs_end_time; /* time_t */ + uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ + uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ + uint64_t vrs_bytes_issued; /* read bytes issued */ + uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrs_bytes_est; /* total bytes to scan */ + uint64_t vrs_errors; /* scanning errors */ + uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ + uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ + uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ +} vdev_rebuild_stat_t; + /* * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering * of this enum must be maintained to ensure the errata identifiers map to @@ -1047,6 +1067,7 @@ typedef struct vdev_stat { uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ + uint64_t vs_rebuild_processed; /* bytes rebuilt */ } vdev_stat_t; /* @@ -1178,6 +1199,13 @@ typedef enum { VDEV_TRIM_COMPLETE, } vdev_trim_state_t; +typedef enum { + VDEV_REBUILD_NONE, + VDEV_REBUILD_ACTIVE, + VDEV_REBUILD_CANCELED, + VDEV_REBUILD_COMPLETE, +} vdev_rebuild_state_t; + /* * nvlist name constants. 
Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl @@ -1337,6 +1365,8 @@ typedef enum { ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, + ZFS_ERR_RESILVER_IN_PROGRESS, + ZFS_ERR_REBUILD_IN_PROGRESS, } zfs_errno_t; /* @@ -1478,7 +1508,12 @@ typedef enum { * given payloads: * * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END + * ESC_ZFS_RESILVER_FINISH + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING + * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * @@ -1532,6 +1567,7 @@ typedef enum { #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" +#define ZFS_EV_RESILVER_TYPE "resilver_type" #ifdef __cplusplus } diff --git a/include/sys/spa.h b/include/sys/spa.h index 5806dda41..9b96eb1f8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 - -/* - * Controls the behavior of spa_vdev_remove(). 
- */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 +#define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); + int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); @@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 6481d5397..2c52cb666 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -36,6 +36,7 @@ #include <sys/spa_checkpoint.h> #include <sys/spa_log_spacemap.h> #include <sys/vdev.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_removal.h> #include <sys/metaslab.h> #include <sys/dmu.h> diff --git a/include/sys/vdev.h b/include/sys/vdev.h index d93ef78f1..a7e880636 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); + boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 56407a191..b9298c62d 100644 --- a/include/sys/vdev_impl.h +++ 
b/include/sys/vdev_impl.h @@ -38,6 +38,7 @@ #include <sys/uberblock_impl.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_removal.h> #include <sys/zfs_ratelimit.h> @@ -295,13 +296,26 @@ struct vdev { uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ - /* for limiting outstanding I/Os (initialize and TRIM) */ + /* Rebuild related */ + boolean_t vdev_rebuilding; + boolean_t vdev_rebuild_exit_wanted; + boolean_t vdev_rebuild_cancel_wanted; + boolean_t vdev_rebuild_reset_wanted; + kmutex_t vdev_rebuild_lock; + kcondvar_t vdev_rebuild_cv; + kthread_t *vdev_rebuild_thread; + vdev_rebuild_t vdev_rebuild_config; + + /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; + kmutex_t vdev_rebuild_io_lock; + kcondvar_t vdev_rebuild_io_cv; + uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -358,6 +372,7 @@ struct vdev { uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ + uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h new file mode 100644 index 000000000..3d4b8cc46 --- /dev/null +++ b/include/sys/vdev_rebuild.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_REBUILD_H +#define _SYS_VDEV_REBUILD_H + +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Number of entries in the physical vdev_rebuild_phys structure. This + * state is stored per top-level as VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +#define REBUILD_PHYS_ENTRIES 12 + +/* + * On-disk rebuild configuration and state. When adding new fields they + * must be added to the end of the structure. 
+ */ +typedef struct vdev_rebuild_phys { + uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */ + uint64_t vrp_last_offset; /* last rebuilt offset */ + uint64_t vrp_min_txg; /* minimum missing txg */ + uint64_t vrp_max_txg; /* maximum missing txg */ + uint64_t vrp_start_time; /* start time */ + uint64_t vrp_end_time; /* end time */ + uint64_t vrp_scan_time_ms; /* total run time in ms */ + uint64_t vrp_bytes_scanned; /* alloc bytes scanned */ + uint64_t vrp_bytes_issued; /* read bytes rebuilt */ + uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrp_bytes_est; /* total bytes to scan */ + uint64_t vrp_errors; /* errors during rebuild */ +} vdev_rebuild_phys_t; + +/* + * The vdev_rebuild_t describes the current state and how a top-level vdev + * should be rebuilt. The core elements are the top-vdev, the metaslab being + * rebuilt, range tree containing the allocated extents and the on-disk state. + */ +typedef struct vdev_rebuild { + vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ + metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ + range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + + /* In-core state and progress */ + uint64_t vr_scan_offset[TXG_SIZE]; + uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + + /* Per-rebuild pass statistics for calculating bandwidth */ + uint64_t vr_pass_start_time; + uint64_t vr_pass_bytes_scanned; + uint64_t vr_pass_bytes_issued; + + /* On-disk state updated by vdev_rebuild_zap_update_sync() */ + vdev_rebuild_phys_t vr_rebuild_phys; +} vdev_rebuild_t; + +boolean_t vdev_rebuild_active(vdev_t *); + +int vdev_rebuild_load(vdev_t *); +void vdev_rebuild(vdev_t *); +void vdev_rebuild_stop_wait(vdev_t *); +void vdev_rebuild_stop_all(spa_t *); +void vdev_rebuild_restart(spa_t *); +void vdev_rebuild_clear_sync(void *, dmu_tx_t *); +int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REBUILD_H */ diff --git a/include/sys/zio_priority.h 
b/include/sys/zio_priority.h index 0b422904e..2d8e7fc36 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -31,6 +31,7 @@ typedef enum zio_priority { ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2d8767d5b..7e19a62e2 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -74,6 +74,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, + SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURES } spa_feature_t; |