diff options
author | Brian Behlendorf <[email protected]> | 2020-07-03 11:05:50 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2020-07-03 11:05:50 -0700 |
commit | 9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b (patch) | |
tree | 715c2fa00e55762764cadef8460da09f919910ad /module/zfs | |
parent | 7ddb753d17f2c12f152647c0e34eb9c42ee5e4af (diff) |
Add device rebuild feature
The device_rebuild feature enables sequential reconstruction when
resilvering. Mirror vdevs can be rebuilt in LBA order which may
more quickly restore redundancy depending on the pools average block
size, overall fragmentation and the performance characteristics
of the devices. However, block checksums cannot be verified
as part of the rebuild thus a scrub is automatically started after
the sequential resilver completes.
The new '-s' option has been added to the `zpool attach` and
`zpool replace` command to request sequential reconstruction
instead of healing reconstruction when resilvering.
zpool attach -s <pool> <existing vdev> <new vdev>
zpool replace -s <pool> <old vdev> <new vdev>
The `zpool status` output has been updated to report the progress
of sequential resilvering in the same way as healing resilvering.
The one notable difference is that multiple sequential resilvers
may be in progress as long as they're operating on different
top-level vdevs.
The `zpool wait -t resilver` command was extended to wait on
sequential resilvers. From this perspective they are no different
than healing resilvers.
Sequential resilvers cannot be supported for RAIDZ, but are
compatible with the dRAID feature being developed.
As part of this change the resilver_restart_* tests were moved
in to the functional/replacement directory. Additionally, the
replacement tests were renamed and extended to verify both
resilvering and rebuilding.
Original-patch-by: Isaac Huang <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: John Poduska <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #10349
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/Makefile.in | 1 | ||||
-rw-r--r-- | module/zfs/dsl_scan.c | 44 | ||||
-rw-r--r-- | module/zfs/spa.c | 109 | ||||
-rw-r--r-- | module/zfs/spa_misc.c | 29 | ||||
-rw-r--r-- | module/zfs/vdev.c | 238 | ||||
-rw-r--r-- | module/zfs/vdev_label.c | 17 | ||||
-rw-r--r-- | module/zfs/vdev_mirror.c | 5 | ||||
-rw-r--r-- | module/zfs/vdev_queue.c | 18 | ||||
-rw-r--r-- | module/zfs/vdev_rebuild.c | 1106 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 6 |
10 files changed, 1484 insertions, 89 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 7ea976d12..9ddcd6c33 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_rebuild.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += vdev_trim.o diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 895ffbf0a..712af664e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -704,8 +704,9 @@ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) return (SET_ERROR(EBUSY)); return (0); @@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); + nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } @@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + /* + * When starting a resilver clear any existing rebuild state. + * This is required to prevent stale rebuild status from + * being reported when a rebuild is run, then a resilver and + * finally a scrub. In which case only the scrub status + * should be reported by 'zpool status'. + */ + if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + vdev_rebuild_clear_sync( + (void *)(uintptr_t)vd->vdev_id, tx); + } + } } /* back to the generic stuff */ @@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); - - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); + + if (scn->scn_phys.scn_min_txg) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_FINISH); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_SCRUB_FINISH); + } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); + 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 943330886..6b60227d2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -57,6 +57,7 @@ #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> #include <sys/vdev_initialize.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> #include <sys/metaslab.h> @@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa) vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); @@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) update_config_cache); /* - * Check all DTLs to see if anything needs resilvering. + * Check if a rebuild was in progress and if so resume it. + * Then check all DTLs to see if anything needs resilvering. + * The resilver will be deferred if a rebuild was started. */ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + if (vdev_rebuild_active(spa->spa_root_vdev)) { + vdev_rebuild_restart(spa); + } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); + } /* * Log the fact that we booted up (so that we can detect if @@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. + * + * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) + * should be performed instead of traditional healing reconstruction. From + * an administrators perspective these are both resilver operations. */ int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, + int rebuild) { uint64_t txg, dtl_max_txg; - vdev_t *rvd __maybe_unused = spa->spa_root_vdev; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; @@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } + if (rebuild) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + if (dsl_scan_resilvering(spa_get_dsl(spa))) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RESILVER_IN_PROGRESS)); + } else { + if (vdev_rebuild_active(rvd)) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_REBUILD_IN_PROGRESS)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (rebuild) { + /* + * For rebuilds, the parent vdev must support reconstruction + * using only space maps. This means the only allowable + * parents are the root vdev or a mirror vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + } + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid @@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_dirty(tvd, VDD_DTL, newvd, txg); /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. We do not do this if resilvers have been - * deferred. + * Schedule the resilver or rebuild to restart in the future. We do + * this to ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. */ - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_defer_resilver(newvd); - else - dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); + } else { + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) ASSERT(spa_writeable(spa)); - txg = spa_vdev_enter(spa); + txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa) } spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * If a detach was not performed above replace waiters will not have + * been notified. In which case we must do so now. + */ + spa_notify_waiters(spa); } /* @@ -7971,9 +8021,21 @@ spa_async_thread(void *arg) spa_vdev_resilver_done(spa); /* + * If any devices are done replacing, detach them. Then if no + * top-level vdevs are rebuilding attempt to kick off a scrub. + */ + if (tasks & SPA_ASYNC_REBUILD_DONE) { + spa_vdev_resilver_done(spa); + + if (!vdev_rebuild_active(spa->spa_root_vdev)) + (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); + } + + /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && + !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); @@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: + if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + break; + /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 61cefa3dd..4c884409a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1166,6 +1166,30 @@ spa_vdev_enter(spa_t *spa) } /* + * The same as spa_vdev_enter() above but additionally takes the guid of + * the vdev being detached. When there is a rebuild in process it will be + * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). + * The rebuild is canceled if only a single child remains after the detach. + */ +uint64_t +spa_vdev_detach_enter(spa_t *spa, uint64_t guid) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + if (guid != 0) { + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd) { + vdev_rebuild_stop_wait(vd->vdev_top); + } + } + + return (spa_vdev_config_enter(spa)); +} + +/* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. @@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) /* * Reassess the DTLs. */ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; @@ -1271,6 +1295,7 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); + vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); @@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) } if (vd != NULL || error == 0) - vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE); + vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 03360120a..27ac17fea 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/dmu_tx.h> #include <sys/dsl_dir.h> #include <sys/vdev_impl.h> +#include <sys/vdev_rebuild.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); @@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } + txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, @@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + &vd->vdev_rebuild_txg); + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); @@ -890,6 +902,7 @@ vdev_free(vdev_t *vd) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); + mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); @@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); + mutex_destroy(&vd->vdev_rebuild_lock); + mutex_destroy(&vd->vdev_rebuild_io_lock); + cv_destroy(&vd->vdev_rebuild_cv); + cv_destroy(&vd->vdev_rebuild_io_cv); + zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); + ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_rebuilding = svd->vdev_rebuilding; + tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; @@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; + svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) @@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd) * excise the DTLs. */ static boolean_t -vdev_dtl_should_excise(vdev_t *vd) +vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd) if (vd->vdev_resilver_deferred) return (B_FALSE); - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) + if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. - */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); + if (rebuild_done) { + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + /* Rebuild not initiated by attach */ + if (vd->vdev_rebuild_txg == 0) + return (B_TRUE); + + /* + * When a rebuild completes without error then all missing data + * up to the rebuild max txg has been reconstructed and the DTL + * is eligible for excision. + */ + if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && + vdev_dtl_max(vd) <= vrp->vrp_max_txg) { + ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); + ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); + return (B_TRUE); + } + } else { + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; + + /* Resilver not initiated by attach */ + if (vd->vdev_resilver_txg == 0) + return (B_TRUE); + + /* + * When a resilver is initiated the scan will assign the + * scn_max_txg value to the highest txg value that exists + * in all DTLs. If this device's max DTL is not part of this + * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] + * then it is not eligible for excision. + */ + if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { + ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); + ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); + return (B_TRUE); + } } + return (B_FALSE); } @@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd) * write operations will be issued to the pool. */ void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; @@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); + scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* - * If requested, pretend the scan completed cleanly. + * If requested, pretend the scan or rebuild completed cleanly. */ - if (zfs_scan_ignore_errors && scn) - scn->scn_phys.scn_errors = 0; + if (zfs_scan_ignore_errors) { + if (scn != NULL) + scn->scn_phys.scn_errors = 0; + if (vr != NULL) + vr->vr_rebuild_phys.vrp_errors = 0; + } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { @@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) } /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. + * If we've completed a scrub/resilver or a rebuild cleanly + * then determine if this vdev should remove any DTLs. We + * only want to excise regions on vdevs that were available + * during the entire duration of this scan. */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { + if (rebuild_done && + vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { + check_excise = B_TRUE; + } else { + if (spa->spa_scrub_started || + (scn != NULL && scn->scn_phys.scn_errors == 0)) { + check_excise = B_TRUE; + } + } + + if (scrub_txg && check_excise && + vdev_dtl_should_excise(vd, rebuild_done)) { /* - * We completed a scrub up to scrub_txg. If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. + * We completed a scrub, resilver or rebuild up to + * scrub_txg. If we did it without rebooting, then + * the scrub dtl will be valid, so excise the old + * region and fold in the scrub dtl. Otherwise, + * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference @@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty + * If the vdev was resilvering or rebuilding and no longer + * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ - if (txg != 0 && vd->vdev_resilver_txg != 0 && + if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); + if (vd->vdev_rebuild_txg != 0) { + vd->vdev_rebuild_txg = 0; + vdev_config_dirty(vd->vdev_top); + } else if (vd->vdev_resilver_txg != 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } } mutex_exit(&vd->vdev_dtl_lock); @@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd) * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, @@ -3066,6 +3136,20 @@ vdev_load(vdev_t *vd) } /* + * Load any rebuild state from the top-level vdev zap. + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + error = vdev_rebuild_load(vd); + if (error && error != ENOTSUP) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " + "failed [error=%d]", error); + return (error); + } + } + + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { @@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; @@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; + + /* Set when there is a deferred resilver. */ + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } + /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab @@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } + + /* + * Report fragmentation and rebuild progress for top-level, + * non-auxiliary, concrete devices. + */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { + /* + * Repair is the result of a resilver issued by the + * scan thread (spa_sync). + */ if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; - /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } + /* + * Repair is the result of a rebuild issued by the + * rebuild thread (vdev_rebuild_thread). + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + vdev_t *tvd = vd->vdev_top; + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; + + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(rebuilt, psize); + vs->vs_rebuild_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; + zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as @@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; + /* + * Solely for the purposes of 'zpool iostat -lqrw' + * reporting use the priority to catagorize the IO. + * Only the following are reported to user space: + * + * ZIO_PRIORITY_SYNC_READ, + * ZIO_PRIORITY_SYNC_WRITE, + * ZIO_PRIORITY_ASYNC_READ, + * ZIO_PRIORITY_ASYNC_WRITE, + * ZIO_PRIORITY_SCRUB, + * ZIO_PRIORITY_TRIM. + */ + if (priority == ZIO_PRIORITY_REBUILD) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_SCRUB); + } else if (priority == ZIO_PRIORITY_INITIALIZING) { + ASSERT3U(type, ==, ZIO_TYPE_WRITE); + priority = ZIO_PRIORITY_ASYNC_WRITE; + } else if (priority == ZIO_PRIORITY_REMOVAL) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_ASYNC_READ); + } + vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { - vsx->vsx_agg_histo[zio->io_priority] + vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { - vsx->vsx_ind_histo[zio->io_priority] + vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { - vsx->vsx_queue_histo[zio->io_priority] + vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 81cfd5cce..8c7468255 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) } } +static void +top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + if (vd == vd->vdev_top) { + vdev_rebuild_stat_t vrs; + if (vdev_rebuild_get_stats(vd, &vrs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, + sizeof (vrs) / sizeof (uint64_t)); + } + } +} + /* * Generate the nvlist representing this vdev's config. */ @@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); + top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); + if (vd->vdev_rebuild_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 3edd65c01..094530e9b 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e31271dcb..a8ef3d747 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 2; +uint32_t zfs_vdev_rebuild_min_active = 1; +uint32_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_initializing_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_min_active); default: panic("invalid priority %u", p); return (0); @@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); @@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { @@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { @@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, + "Max active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, + "Min active rebuild I/Os per vdev"); + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* END CSTYLED */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c new file mode 100644 index 000000000..bf1079fd7 --- /dev/null +++ b/module/zfs/vdev_rebuild.c @@ -0,0 +1,1106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include <sys/vdev_impl.h> +#include <sys/dsl_scan.h> +#include <sys/spa_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_rebuild.h> +#include <sys/zio.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zap.h> + +/* + * This file contains the sequential reconstruction implementation for + * resilvering. This form of resilvering is internally referred to as device + * rebuild to avoid conflating it with the traditional healing reconstruction + * performed by the dsl scan code. + * + * When replacing a device, or scrubbing the pool, ZFS has historically used + * a process called resilvering which is a form of healing reconstruction. + * This approach has the advantage that as blocks are read from disk their + * checksums can be immediately verified and the data repaired. Unfortunately, + * it also results in a random IO pattern to the disk even when extra care + * is taken to sequentialize the IO as much as possible. This substantially + * increases the time required to resilver the pool and restore redundancy. + * + * For mirrored devices it's possible to implement an alternate sequential + * reconstruction strategy when resilvering. Sequential reconstruction + * behaves like a traditional RAID rebuild and reconstructs a device in LBA + * order without verifying the checksum. After this phase completes a second + * scrub phase is started to verify all of the checksums. This two phase + * process will take longer than the healing reconstruction described above. + * However, it has that advantage that after the reconstruction first phase + * completes redundancy has been restored. At this point the pool can incur + * another device failure without risking data loss. + * + * There are a few noteworthy limitations and other advantages of resilvering + * using sequential reconstruction vs healing reconstruction. + * + * Limitations: + * + * - Only supported for mirror vdev types. Due to the variable stripe + * width used by raidz sequential reconstruction is not possible. + * + * - Block checksums are not verified during sequential reconstuction. + * Similar to traditional RAID the parity/mirror data is reconstructed + * but cannot be immediately double checked. For this reason when the + * last active resilver completes the pool is automatically scrubbed. + * + * - Deferred resilvers using sequential reconstruction are not currently + * supported. When adding another vdev to an active top-level resilver + * it must be restarted. + * + * Advantages: + * + * - Sequential reconstuction is performed in LBA order which may be faster + * than healing reconstuction particularly when using using HDDs (or + * especially with SMR devices). Only allocated capacity is resilvered. + * + * - Sequential reconstruction is not constrained by ZFS block boundaries. + * This allows it to issue larger IOs to disk which span multiple blocks + * allowing all of these logical blocks to be repaired with a single IO. + * + * - Unlike a healing resilver or scrub which are pool wide operations, + * sequential reconstruction is handled by the top-level mirror vdevs. + * This allows for it to be started or canceled on a top-level vdev + * without impacting any other top-level vdevs in the pool. + * + * - Data only referenced by a pool checkpoint will be repaired because + * that space is reflected in the space maps. This differs for a + * healing resilver or scrub which will not repair that data. + */ + + +/* + * Maximum number of queued rebuild I/Os top-level vdev. The number of + * concurrent rebuild I/Os issued to the device is controlled by the + * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module + * options. + */ +unsigned int zfs_rebuild_queue_limit = 20; + +/* + * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + */ +unsigned long zfs_rebuild_max_segment = 1024 * 1024; + +/* + * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). + */ +static void vdev_rebuild_thread(void *arg); + +/* + * Clear the per-vdev rebuild bytes value for a vdev tree. + */ +static void +clear_rebuild_bytes(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (uint64_t i = 0; i < vd->vdev_children; i++) + clear_rebuild_bytes(vd->vdev_child[i]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_rebuild_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Determines whether a vdev_rebuild_thread() should be stopped. + */ +static boolean_t +vdev_rebuild_should_stop(vdev_t *vd) +{ + return (!vdev_writeable(vd) || vd->vdev_removing || + vd->vdev_rebuild_exit_wanted || + vd->vdev_rebuild_cancel_wanted || + vd->vdev_rebuild_reset_wanted); +} + +/* + * Determine if the rebuild should be canceled. This may happen when all + * vdevs with MISSING DTLs are detached. + */ +static boolean_t +vdev_rebuild_should_cancel(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The sync task for updating the on-disk state of a rebuild. This is + * scheduled by vdev_rebuild_range(). + */ +static void +vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { + vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; + vr->vr_scan_offset[txg & TXG_MASK] = 0; + } + + vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Initialize the on-disk state for a new rebuild, start the rebuild thread. + */ +static void +vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_rebuilding); + + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + mutex_enter(&vd->vdev_rebuild_lock); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_start_time = gethrestime_sec(); + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* + * Rebuilds are currently only used when replacing a device, in which + * case there must be DTL_MISSING entries. In the future, we could + * allow rebuilds to be used in a way similar to a scrub. This would + * be useful because it would allow us to rebuild the space used by + * pool checkpoints. + */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu started", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +{ + nvlist_t *aux = fnvlist_alloc(); + + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); + spa_event_notify(spa, vd, aux, name); + nvlist_free(aux); +} + +/* + * Called to request that a new rebuild be started. The feature will remain + * active for the duration of the rebuild, then revert to the enabled state. + */ +static void +vdev_rebuild_initiate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_top == vd); + ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); + ASSERT(!vd->vdev_rebuilding); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + vd->vdev_rebuilding = B_TRUE; + + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); +} + +/* + * Update the on-disk state to completed when a rebuild finishes. + */ +static void +vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu complete", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + /* Handles detaching of spares */ + spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Update the on-disk state to canceled when a rebuild finishes. + */ +static void +vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu canceled", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + vd->vdev_rebuild_cancel_wanted = B_FALSE; + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Resets the progress of a running rebuild. This will occur when a new + * vdev is added to rebuild. + */ +static void +vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + vrp->vrp_last_offset = 0; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_bytes_scanned = 0; + vrp->vrp_bytes_issued = 0; + vrp->vrp_bytes_rebuilt = 0; + vrp->vrp_bytes_est = 0; + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* See vdev_rebuild_initiate_sync comment */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu reset", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + vd->vdev_rebuild_reset_wanted = B_FALSE; + ASSERT(vd->vdev_rebuilding); + + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Clear the last rebuild status. + */ +void +vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + objset_t *mos = spa_meta_objset(spa); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || + vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { + mutex_exit(&vd->vdev_rebuild_lock); + return; + } + + clear_rebuild_bytes(vd); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + + if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { + VERIFY0(zap_update(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + } + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * The zio_done_func_t callback for each rebuild I/O issued. It's responsible + * for updating the rebuild stats and limiting the number of in flight I/Os. + */ +static void +vdev_rebuild_cb(zio_t *zio) +{ + vdev_rebuild_t *vr = zio->io_private; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vdev_t *vd = vr->vr_top_vdev; + + mutex_enter(&vd->vdev_rebuild_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the top-level vdev was unavailable. + * Attempt to roll back to the last completed offset, in order + * resume from the correct location if the pool is resumed. + * (This works because spa_sync waits on spa_txg_zio before + * it runs sync tasks.) + */ + uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else if (zio->io_error) { + vrp->vrp_errors++; + } + + abd_free(zio->io_abd); + + ASSERT3U(vd->vdev_rebuild_inflight, >, 0); + vd->vdev_rebuild_inflight--; + cv_broadcast(&vd->vdev_rebuild_io_cv); + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * Rebuild the data in this range by constructing a special dummy block + * pointer for the given range. It has no relation to any existing blocks + * in the pool. But by disabling checksum verification and issuing a scrub + * I/O mirrored vdevs will replicate the block using any available mirror + * leaf vdevs. + */ +static void +vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, + uint64_t txg) +{ + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + uint64_t psize = asize; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + blkptr_t blk, *bp = &blk; + BP_ZERO(bp); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], start); + DVA_SET_GANG(&bp->blk_dva[0], 0); + DVA_SET_ASIZE(&bp->blk_dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + /* + * We increment the issued bytes by the asize rather than the psize + * so the scanned and issued bytes may be directly compared. This + * is consistent with the scrub/resilver issued reporting. + */ + vr->vr_pass_bytes_issued += asize; + vr->vr_rebuild_phys.vrp_bytes_issued += asize; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); +} + +/* + * Issues a rebuild I/O and takes care of rate limiting the number of queued + * rebuild I/Os. The provided start and size must be properly aligned for the + * top-level vdev type being rebuilt. + */ +static int +vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) +{ + uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + + ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); + ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); + + vr->vr_pass_bytes_scanned += size; + vr->vr_rebuild_phys.vrp_bytes_scanned += size; + + mutex_enter(&vd->vdev_rebuild_io_lock); + + /* Limit in flight rebuild I/Os */ + while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + vd->vdev_rebuild_inflight++; + mutex_exit(&vd->vdev_rebuild_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + /* This is the first I/O for this txg. */ + if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { + vr->vr_scan_offset[txg & TXG_MASK] = start; + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_rebuild_update_sync, + (void *)(uintptr_t)vd->vdev_id, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* When exiting write out our progress. */ + if (vdev_rebuild_should_stop(vd)) { + mutex_enter(&vd->vdev_rebuild_io_lock); + vd->vdev_rebuild_inflight--; + mutex_exit(&vd->vdev_rebuild_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_rebuild_lock); + + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vdev_rebuild_rebuild_block(vr, start, size, txg); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Split range into legally-sized logical chunks given the constraints of the + * top-level mirror vdev type. + */ +static uint64_t +vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) +{ + uint64_t chunk_size, max_asize, max_segment; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, + 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); + max_asize = vdev_psize_to_asize(vd, max_segment); + chunk_size = MIN(size, max_asize); + + return (chunk_size); +} + +/* + * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. + */ +static int +vdev_rebuild_ranges(vdev_rebuild_t *vr) +{ + vdev_t *vd = vr->vr_top_vdev; + zfs_btree_t *t = &vr->vr_scan_tree->rt_root; + zfs_btree_index_t idx; + int error; + + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t start = rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + + /* + * zfs_scan_suspend_progress can be set to disable rebuild + * progress for testing. See comment in dsl_scan_sync(). + */ + while (zfs_scan_suspend_progress && + !vdev_rebuild_should_stop(vd)) { + delay(hz); + } + + while (size > 0) { + uint64_t chunk_size; + + chunk_size = vdev_rebuild_chunk_size(vd, start, size); + + error = vdev_rebuild_range(vr, start, chunk_size); + if (error != 0) + return (error); + + size -= chunk_size; + start += chunk_size; + } + } + + return (0); +} + +/* + * Calculates the estimated capacity which remains to be scanned. Since + * we traverse the pool in metaslab order only allocated capacity beyond + * the vrp_last_offset need be considered. All lower offsets must have + * already been rebuilt and are thus already included in vrp_bytes_scanned. + */ +static void +vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t bytes_est = vrp->vrp_bytes_scanned; + + if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) + return; + + for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + bytes_est += metaslab_allocated_space(msp); + mutex_exit(&msp->ms_lock); + } + + vrp->vrp_bytes_est = bytes_est; +} + +/* + * Load from disk the top-level vdev's rebuild information. + */ +int +vdev_rebuild_load(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + spa_t *spa = vd->vdev_spa; + int err = 0; + + mutex_enter(&vd->vdev_rebuild_lock); + vd->vdev_rebuilding = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + mutex_exit(&vd->vdev_rebuild_lock); + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(vd->vdev_top == vd); + + err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp); + + /* + * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should + * not prevent a pool from being imported. Clear the rebuild + * status allowing a new resilver/rebuild to be started. + */ + if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + } else if (err) { + mutex_exit(&vd->vdev_rebuild_lock); + return (err); + } + + vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; + vr->vr_top_vdev = vd; + + mutex_exit(&vd->vdev_rebuild_lock); + + return (0); +} + +/* + * Each scan thread is responsible for rebuilding a top-level vdev. The + * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +static void +vdev_rebuild_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + /* + * If there's a scrub in process request that it be stopped. This + * is not required for a correct rebuild, but we do want rebuilds to + * emulate the resilver behavior as much as possible. + */ + dsl_pool_t *dsl = spa_get_dsl(spa); + if (dsl_scan_scrubbing(dsl)) + dsl_scan_cancel(dsl); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); + ASSERT(vd->vdev_rebuilding); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); + ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); + ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); + + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vr->vr_top_vdev = vd; + vr->vr_scan_msp = NULL; + vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + vr->vr_pass_start_time = gethrtime(); + vr->vr_pass_bytes_scanned = 0; + vr->vr_pass_bytes_issued = 0; + + uint64_t update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, 0); + + clear_rebuild_bytes(vr->vr_top_vdev); + + mutex_exit(&vd->vdev_rebuild_lock); + + /* + * Systematically walk the metaslabs and issue rebuild I/Os for + * all ranges in the allocated space map. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + vr->vr_scan_msp = msp; + + /* + * Removal of vdevs from the vdev tree may eliminate the need + * for the rebuild, in which case it should be canceled. The + * vdev_rebuild_cancel_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (vdev_rebuild_should_cancel(vd)) { + vd->vdev_rebuild_cancel_wanted = B_TRUE; + error = EINTR; + break; + } + + ASSERT0(range_tree_space(vr->vr_scan_tree)); + + /* + * Disable any new allocations to this metaslab and wait + * for any writes inflight to complete. This is needed to + * ensure all allocated ranges are rebuilt. + */ + metaslab_disable(msp); + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(dsl, 0); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * When a metaslab has been allocated from read its allocated + * ranges from the space map object in to the vr_scan_tree. + * Then add inflight / unflushed ranges and remove inflight / + * unflushed frees. This is the minimum range to be rebuilt. + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + vr->vr_scan_tree, SM_ALLOC)); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space( + msp->ms_allocating[i])); + } + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, vr->vr_scan_tree); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, vr->vr_scan_tree); + + /* + * Remove ranges which have already been rebuilt based + * on the last offset. This can happen when restarting + * a scan after exporting and re-importing the pool. + */ + range_tree_clear(vr->vr_scan_tree, 0, + vrp->vrp_last_offset); + } + + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + /* + * To provide an accurate estimate re-calculate the estimated + * size every 5 minutes to account for recent allocations and + * frees made space maps which have not yet been rebuilt. + */ + if (gethrtime() > update_est_time + SEC2NSEC(300)) { + update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, i); + } + + /* + * Walk the allocated space map and issue the rebuild I/O. + */ + error = vdev_rebuild_ranges(vr); + range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + metaslab_enable(msp, B_FALSE, B_FALSE); + + if (error != 0) + break; + } + + range_tree_destroy(vr->vr_scan_tree); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* Wait for any remaining rebuild I/O to complete */ + mutex_enter(&vd->vdev_rebuild_io_lock); + while (vd->vdev_rebuild_inflight > 0) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + dsl_pool_t *dp = spa_get_dsl(spa); + dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (error == 0) { + /* + * After a successful rebuild clear the DTLs of all ranges + * which were missing when the rebuild was started. These + * ranges must have been rebuilt as a consequence of rebuilding + * all allocated space. Note that unlike a scrub or resilver + * the rebuild operation will reconstruct data only referenced + * by a pool checkpoint. See the dsl_scan_done() comments. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_cancel_wanted) { + /* + * The rebuild operation was canceled. This will occur when + * a device participating in the rebuild is detached. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_reset_wanted) { + /* + * Reset the running rebuild without canceling and restarting + * it. This will occur when a new device is attached and must + * participate in the rebuild. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else { + /* + * The rebuild operation should be suspended. This may occur + * when detaching a child vdev or when exporting the pool. The + * rebuild is left in the active state so it will be resumed. + */ + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + vd->vdev_rebuilding = B_FALSE; + } + + dmu_tx_commit(tx); + + vd->vdev_rebuild_thread = NULL; + mutex_exit(&vd->vdev_rebuild_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Returns B_TRUE if any top-level vdev are rebuilding. + */ +boolean_t +vdev_rebuild_active(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t ret = B_FALSE; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + ret = vdev_rebuild_active(vd->vdev_child[i]); + if (ret) + return (ret); + } + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + mutex_exit(&vd->vdev_rebuild_lock); + } + + return (ret); +} + +/* + * Start a rebuild operation. The rebuild may be restarted when the + * top-level vdev is currently actively rebuilding. + */ +void +vdev_rebuild(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_top == vd); + ASSERT(vdev_is_concrete(vd)); + ASSERT(!vd->vdev_removing); + ASSERT(spa_feature_is_enabled(vd->vdev_spa, + SPA_FEATURE_DEVICE_REBUILD)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuilding) { + ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); + + /* + * Signal a running rebuild operation that it should restart + * from the beginning because a new device was attached. The + * vdev_rebuild_reset_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (!vd->vdev_rebuild_reset_wanted) + vd->vdev_rebuild_reset_wanted = B_TRUE; + } else { + vdev_rebuild_initiate(vd); + } + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_restart_impl(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_restart_impl(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && + vdev_writeable(vd) && !vd->vdev_rebuilding) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DEVICE_REBUILD)); + vd->vdev_rebuilding = B_TRUE; + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, + maxclsyspri); + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Conditionally restart all of the vdev_rebuild_thread's for a pool. The + * feature flag must be active and the rebuild in the active state. This + * cannot be used to start a new rebuild. + */ +void +vdev_rebuild_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vdev_rebuild_restart_impl(spa->spa_root_vdev); +} + +/* + * Stop and wait for all of the vdev_rebuild_thread's associated with the + * vdev tree provide to be terminated (canceled or stopped). + */ +void +vdev_rebuild_stop_wait(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_stop_wait(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuild_thread != NULL) { + vd->vdev_rebuild_exit_wanted = B_TRUE; + while (vd->vdev_rebuilding) { + cv_wait(&vd->vdev_rebuild_cv, + &vd->vdev_rebuild_lock); + } + vd->vdev_rebuild_exit_wanted = B_FALSE; + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Stop all rebuild operations but leave them in the active state so they + * will be resumed when importing the pool. + */ +void +vdev_rebuild_stop_all(spa_t *spa) +{ + vdev_rebuild_stop_wait(spa->spa_root_vdev); +} + +/* + * Rebuild statistics reported per top-level vdev. + */ +int +vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) +{ + spa_t *spa = tvd->vdev_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (SET_ERROR(ENOTSUP)); + + if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) + return (SET_ERROR(EINVAL)); + + int error = zap_contains(spa_meta_objset(spa), + tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); + + if (error == ENOENT) { + bzero(vrs, sizeof (vdev_rebuild_stat_t)); + vrs->vrs_state = VDEV_REBUILD_NONE; + error = 0; + } else if (error == 0) { + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&tvd->vdev_rebuild_lock); + vrs->vrs_state = vrp->vrp_rebuild_state; + vrs->vrs_start_time = vrp->vrp_start_time; + vrs->vrs_end_time = vrp->vrp_end_time; + vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; + vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; + vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; + vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; + vrs->vrs_bytes_est = vrp->vrp_bytes_est; + vrs->vrs_errors = vrp->vrp_errors; + vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - + vr->vr_pass_start_time); + vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; + vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + mutex_exit(&tvd->vdev_rebuild_lock); + } + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, + "Max segment size in bytes of rebuild reads"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 4122114b5..1d2ae6270 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1938,8 +1938,9 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - int replacing = zc->zc_cookie; nvlist_t *config; + int replacing = zc->zc_cookie; + int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) @@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, + rebuild); nvlist_free(config); } |