-rw-r--r--  cmd/zed/agents/zfs_mod.c | 2
-rw-r--r--  cmd/zed/agents/zfs_retire.c | 11
-rwxr-xr-x  cmd/zed/zed.d/resilver_finish-start-scrub.sh | 2
-rw-r--r--  cmd/zpool/zpool_main.c | 374
-rw-r--r--  cmd/ztest/ztest.c | 16
-rw-r--r--  configure.ac | 1
-rw-r--r--  contrib/pyzfs/libzfs_core/_constants.py | 2
-rw-r--r--  include/libzfs.h | 7
-rw-r--r--  include/sys/Makefile.am | 1
-rw-r--r--  include/sys/dsl_scan.h | 2
-rw-r--r--  include/sys/fs/zfs.h | 38
-rw-r--r--  include/sys/spa.h | 10
-rw-r--r--  include/sys/spa_impl.h | 1
-rw-r--r--  include/sys/vdev.h | 2
-rw-r--r--  include/sys/vdev_impl.h | 17
-rw-r--r--  include/sys/vdev_rebuild.h | 97
-rw-r--r--  include/sys/zio_priority.h | 1
-rw-r--r--  include/zfeature_common.h | 1
-rw-r--r--  lib/libzfs/libzfs_pool.c | 26
-rw-r--r--  lib/libzfs/libzfs_status.c | 47
-rw-r--r--  lib/libzfs/libzfs_util.c | 9
-rw-r--r--  lib/libzpool/Makefile.am | 1
-rw-r--r--  man/man5/zfs-module-parameters.5 | 36
-rw-r--r--  man/man5/zpool-features.5 | 29
-rw-r--r--  man/man8/zpool-attach.8 | 23
-rw-r--r--  man/man8/zpool-replace.8 | 14
-rw-r--r--  man/man8/zpool-status.8 | 4
-rw-r--r--  module/Makefile.bsd | 1
-rw-r--r--  module/zcommon/zfeature_common.c | 5
-rw-r--r--  module/zfs/Makefile.in | 1
-rw-r--r--  module/zfs/dsl_scan.c | 44
-rw-r--r--  module/zfs/spa.c | 109
-rw-r--r--  module/zfs/spa_misc.c | 29
-rw-r--r--  module/zfs/vdev.c | 238
-rw-r--r--  module/zfs/vdev_label.c | 17
-rw-r--r--  module/zfs/vdev_mirror.c | 5
-rw-r--r--  module/zfs/vdev_queue.c | 18
-rw-r--r--  module/zfs/vdev_rebuild.c | 1106
-rw-r--r--  module/zfs/zfs_ioctl.c | 6
-rw-r--r--  tests/runfiles/common.run | 13
-rw-r--r--  tests/zfs-tests/include/libtest.shlib | 25
-rw-r--r--  tests/zfs-tests/tests/functional/Makefile.am | 1
-rw-r--r--  tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg | 1
-rw-r--r--  tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am | 1
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh | 64
-rw-r--r--  tests/zfs-tests/tests/functional/replacement/Makefile.am | 17
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/attach_import.ksh | 67
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh | 111
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh | 173
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh (renamed from tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh) | 16
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/detach.ksh (renamed from tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh) | 10
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh | 78
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh | 126
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh | 70
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/replace_import.ksh | 67
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh | 158
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh (renamed from tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh) | 9
-rw-r--r--  tests/zfs-tests/tests/functional/replacement/replacement.cfg | 5
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh (renamed from tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh) | 39
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh (renamed from tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh) | 24
-rwxr-xr-x  tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh | 112
-rw-r--r--  tests/zfs-tests/tests/functional/resilver/Makefile.am | 9
-rwxr-xr-x  tests/zfs-tests/tests/functional/resilver/cleanup.ksh | 31
-rw-r--r--  tests/zfs-tests/tests/functional/resilver/resilver.cfg | 32
-rwxr-xr-x  tests/zfs-tests/tests/functional/resilver/setup.ksh | 31
65 files changed, 3281 insertions, 362 deletions
diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index 1094d25dd..8d0a3b420 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -437,7 +437,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return;
}
- ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
+ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index f3dbb24b8..665fb216d 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -237,7 +237,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
dev_name, basename(spare_name));
if (zpool_vdev_attach(zhp, dev_name, spare_name,
- replacement, B_TRUE) == 0) {
+ replacement, B_TRUE, B_FALSE) == 0) {
free(dev_name);
nvlist_free(replacement);
return (B_TRUE);
@@ -319,12 +319,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
+ nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
+
/*
* If this is a resource notifying us of device removal then simply
* check for an available spare and continue unless the device is a
* l2arc vdev, in which case we just offline it.
*/
- if (strcmp(class, "resource.fs.zfs.removed") == 0) {
+ if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
+ (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+ state == VDEV_STATE_REMOVED)) {
char *devtype;
char *devname;
@@ -365,8 +369,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
* healthy ones so we need to confirm the actual state value.
*/
if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
- nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
- &state) == 0 && state == VDEV_STATE_HEALTHY) {
+ state == VDEV_STATE_HEALTHY) {
zfs_vdev_repair(hdl, nvl);
return;
}
diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh
index 6f9c0b309..c7cfd1ddb 100755
--- a/cmd/zed/zed.d/resilver_finish-start-scrub.sh
+++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh
@@ -5,10 +5,12 @@
# Exit codes:
# 1: Internal error
# 2: Script wasn't enabled in zed.rc
+# 3: Scrubs are automatically started for sequential resilvers
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"
[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
+[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
[ -n "${ZEVENT_POOL}" ] || exit 1
[ -n "${ZEVENT_SUBCLASS}" ] || exit 1
zed_check_cmd "${ZPOOL}" || exit 1
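This zedlet only runs when enabled in zed.rc, and with this change it now skips sequential resilvers, which schedule their own verifying scrub. A minimal enablement sketch, assuming the default zedlet directory of /etc/zfs/zed.d:

    # Turn on the automatic scrub-after-resilver zedlet; sequential
    # resilvers exit early (code 3) because a verifying scrub is
    # already started when the rebuild completes.
    echo 'ZED_SCRUB_AFTER_RESILVER="1"' >> /etc/zfs/zed.d/zed.rc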
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index ee6c479eb..cdf5511fe 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -337,7 +337,7 @@ get_usage(zpool_help_t idx)
return (gettext("\tadd [-fgLnP] [-o property=value] "
"<pool> <vdev> ...\n"));
case HELP_ATTACH:
- return (gettext("\tattach [-fw] [-o property=value] "
+ return (gettext("\tattach [-fsw] [-o property=value] "
"<pool> <device> <new-device>\n"));
case HELP_CLEAR:
return (gettext("\tclear [-nF] <pool> [device]\n"));
@@ -380,7 +380,7 @@ get_usage(zpool_help_t idx)
case HELP_ONLINE:
return (gettext("\tonline [-e] <pool> <device> ...\n"));
case HELP_REPLACE:
- return (gettext("\treplace [-fw] [-o property=value] "
+ return (gettext("\treplace [-fsw] [-o property=value] "
"<pool> <device> [new-device]\n"));
case HELP_REMOVE:
return (gettext("\tremove [-npsw] <pool> <device> ...\n"));
@@ -2077,10 +2077,10 @@ health_str_to_color(const char *health)
*/
static void
print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
- nvlist_t *nv, int depth, boolean_t isspare)
+ nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
{
nvlist_t **child, *root;
- uint_t c, children;
+ uint_t c, i, children;
pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6];
@@ -2266,6 +2266,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
}
+ /* The top-level vdevs have the rebuild stats */
+ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE &&
+ children == 0) {
+ if (vs->vs_rebuild_processed != 0) {
+ (void) printf(gettext(" (resilvering)"));
+ }
+ }
+
if (cb->vcdl != NULL) {
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
printf(" ");
@@ -2295,11 +2303,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
continue;
+ /* Provide vdev_rebuild_stats to children if available */
+ if (vrs == NULL) {
+ (void) nvlist_lookup_uint64_array(nv,
+ ZPOOL_CONFIG_REBUILD_STATS,
+ (uint64_t **)&vrs, &i);
+ }
+
vname = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
-
print_status_config(zhp, cb, vname, child[c], depth + 2,
- isspare);
+ isspare, vrs);
free(vname);
}
}
@@ -2468,7 +2482,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
if (cb->cb_print_status)
print_status_config(zhp, cb, name, child[c], 2,
- B_FALSE);
+ B_FALSE, NULL);
else
print_import_config(cb, name, child[c], 2);
free(name);
@@ -2622,6 +2636,7 @@ show_import(nvlist_t *config)
break;
case ZPOOL_STATUS_RESILVERING:
+ case ZPOOL_STATUS_REBUILDING:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices were "
"being resilvered.\n"));
@@ -6118,6 +6133,7 @@ static int
zpool_do_attach_or_replace(int argc, char **argv, int replacing)
{
boolean_t force = B_FALSE;
+ boolean_t rebuild = B_FALSE;
boolean_t wait = B_FALSE;
int c;
nvlist_t *nvroot;
@@ -6128,7 +6144,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
int ret;
/* check options */
- while ((c = getopt(argc, argv, "fo:w")) != -1) {
+ while ((c = getopt(argc, argv, "fo:sw")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
@@ -6146,6 +6162,9 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
(add_prop_list(optarg, propval, &props, B_TRUE)))
usage(B_FALSE);
break;
+ case 's':
+ rebuild = B_TRUE;
+ break;
case 'w':
wait = B_TRUE;
break;
@@ -6230,7 +6249,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
return (1);
}
- ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
+ ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
+ rebuild);
if (ret == 0 && wait)
ret = zpool_wait(zhp,
@@ -6244,9 +6264,10 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
}
/*
- * zpool replace [-fw] [-o property=value] <pool> <device> <new_device>
+ * zpool replace [-fsw] [-o property=value] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
+ * -s Use sequential instead of healing reconstruction for resilver.
* -o Set property=value.
* -w Wait for replacing to complete before returning
*
@@ -6260,9 +6281,10 @@ zpool_do_replace(int argc, char **argv)
}
/*
- * zpool attach [-fw] [-o property=value] <pool> <device> <new_device>
+ * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
+ * -s Use sequential instead of healing reconstruction for resilver.
* -o Set property=value.
* -w Wait for resilvering to complete before returning
*
@@ -7132,19 +7154,40 @@ zpool_do_trim(int argc, char **argv)
}
/*
+ * Converts a total number of seconds to a human readable string broken
+ * down in to days/hours/minutes/seconds.
+ */
+static void
+secs_to_dhms(uint64_t total, char *buf)
+{
+ uint64_t days = total / 60 / 60 / 24;
+ uint64_t hours = (total / 60 / 60) % 24;
+ uint64_t mins = (total / 60) % 60;
+ uint64_t secs = (total % 60);
+
+ if (days > 0) {
+ (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu",
+ (u_longlong_t)days, (u_longlong_t)hours,
+ (u_longlong_t)mins, (u_longlong_t)secs);
+ } else {
+ (void) sprintf(buf, "%02llu:%02llu:%02llu",
+ (u_longlong_t)hours, (u_longlong_t)mins,
+ (u_longlong_t)secs);
+ }
+}
+
+/*
* Print out detailed scrub status.
*/
static void
-print_scan_status(pool_scan_stat_t *ps)
+print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
- uint64_t total_secs_left;
- uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
uint64_t pass_scanned, scanned, pass_issued, issued, total;
- uint64_t scan_rate, issue_rate;
+ uint64_t elapsed, scan_rate, issue_rate;
double fraction_done;
char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
- char srate_buf[7], irate_buf[7];
+ char srate_buf[7], irate_buf[7], time_buf[32];
printf(" ");
printf_color(ANSI_BOLD, gettext("scan:"));
@@ -7168,26 +7211,18 @@ print_scan_status(pool_scan_stat_t *ps)
/* Scan is finished or canceled. */
if (ps->pss_state == DSS_FINISHED) {
- total_secs_left = end - start;
- days_left = total_secs_left / 60 / 60 / 24;
- hours_left = (total_secs_left / 60 / 60) % 24;
- mins_left = (total_secs_left / 60) % 60;
- secs_left = (total_secs_left % 60);
+ secs_to_dhms(end - start, time_buf);
if (ps->pss_func == POOL_SCAN_SCRUB) {
(void) printf(gettext("scrub repaired %s "
- "in %llu days %02llu:%02llu:%02llu "
- "with %llu errors on %s"), processed_buf,
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left,
- (u_longlong_t)ps->pss_errors, ctime(&end));
+ "in %s with %llu errors on %s"), processed_buf,
+ time_buf, (u_longlong_t)ps->pss_errors,
+ ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext("resilvered %s "
- "in %llu days %02llu:%02llu:%02llu "
- "with %llu errors on %s"), processed_buf,
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left,
- (u_longlong_t)ps->pss_errors, ctime(&end));
+ "in %s with %llu errors on %s"), processed_buf,
+ time_buf, (u_longlong_t)ps->pss_errors,
+ ctime(&end));
}
return;
} else if (ps->pss_state == DSS_CANCELED) {
@@ -7235,13 +7270,9 @@ print_scan_status(pool_scan_stat_t *ps)
scan_rate = pass_scanned / elapsed;
issue_rate = pass_issued / elapsed;
- total_secs_left = (issue_rate != 0 && total >= issued) ?
+ uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
((total - issued) / issue_rate) : UINT64_MAX;
-
- days_left = total_secs_left / 60 / 60 / 24;
- hours_left = (total_secs_left / 60 / 60) % 24;
- mins_left = (total_secs_left / 60) % 60;
- secs_left = (total_secs_left % 60);
+ secs_to_dhms(total_secs_left, time_buf);
/* format all of the numbers we will be reporting */
zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
@@ -7271,10 +7302,84 @@ print_scan_status(pool_scan_stat_t *ps)
if (pause == 0) {
if (total_secs_left != UINT64_MAX &&
issue_rate >= 10 * 1024 * 1024) {
- (void) printf(gettext(", %llu days "
- "%02llu:%02llu:%02llu to go\n"),
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+ (void) printf(gettext(", %s to go\n"), time_buf);
+ } else {
+ (void) printf(gettext(", no estimated "
+ "completion time\n"));
+ }
+ } else {
+ (void) printf(gettext("\n"));
+ }
+}
+
+static void
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
+{
+ if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
+ return;
+
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("scan:"));
+ printf(" ");
+
+ uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
+ uint64_t bytes_issued = vrs->vrs_bytes_issued;
+ uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
+ uint64_t bytes_est = vrs->vrs_bytes_est;
+ uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
+ (vrs->vrs_pass_time_ms + 1)) * 1000;
+ uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
+ (vrs->vrs_pass_time_ms + 1)) * 1000;
+ double scan_pct = MIN((double)bytes_scanned * 100 /
+ (bytes_est + 1), 100);
+
+ /* Format all of the numbers we will be reporting */
+ char bytes_scanned_buf[7], bytes_issued_buf[7];
+ char bytes_rebuilt_buf[7], bytes_est_buf[7];
+ char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
+ zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
+ sizeof (bytes_scanned_buf));
+ zfs_nicebytes(bytes_issued, bytes_issued_buf,
+ sizeof (bytes_issued_buf));
+ zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
+ sizeof (bytes_rebuilt_buf));
+ zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
+ zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+ zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
+
+ time_t start = vrs->vrs_start_time;
+ time_t end = vrs->vrs_end_time;
+
+ /* Rebuild is finished or canceled. */
+ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) {
+ secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf);
+ (void) printf(gettext("resilvered (%s) %s in %s "
+ "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf,
+ time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end));
+ return;
+ } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) {
+ (void) printf(gettext("resilver (%s) canceled on %s"),
+ vdev_name, ctime(&end));
+ return;
+ } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ (void) printf(gettext("resilver (%s) in progress since %s"),
+ vdev_name, ctime(&start));
+ }
+
+ assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
+
+ secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
+ MAX(scan_rate, 1), time_buf);
+
+ (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
+ "%s total\n"), bytes_scanned_buf, scan_rate_buf,
+ bytes_issued_buf, issue_rate_buf, bytes_est_buf);
+ (void) printf(gettext("\t%s resilvered, %.2f%% done"),
+ bytes_rebuilt_buf, scan_pct);
+
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ if (scan_rate >= 10 * 1024 * 1024) {
+ (void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
"completion time\n"));
@@ -7285,9 +7390,38 @@ print_scan_status(pool_scan_stat_t *ps)
}
/*
- * As we don't scrub checkpointed blocks, we want to warn the
- * user that we skipped scanning some blocks if a checkpoint exists
- * or existed at any time during the scan.
+ * Print rebuild status for top-level vdevs.
+ */
+static void
+print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ if (nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+ char *name = zpool_vdev_name(g_zfs, zhp,
+ child[c], VDEV_NAME_TYPE_ID);
+ print_rebuild_status_impl(vrs, name);
+ free(name);
+ }
+ }
+}
+
+/*
+ * As we don't scrub checkpointed blocks, we want to warn the user that we
+ * skipped scanning some blocks if a checkpoint exists or existed at any
+ * time during the scan. If a sequential instead of healing reconstruction
+ * was performed then the blocks were reconstructed. However, their checksums
+ * have not been verified so we still print the warning.
*/
static void
print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
@@ -7319,6 +7453,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
}
/*
+ * Returns B_TRUE if there is an active rebuild in progress. Otherwise,
+ * B_FALSE is returned and 'rebuild_end_time' is set to the end time for
+ * the last completed (or cancelled) rebuild.
+ */
+static boolean_t
+check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time)
+{
+ nvlist_t **child;
+ uint_t children;
+ boolean_t rebuilding = B_FALSE;
+ uint64_t end_time = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ if (nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+
+ if (vrs->vrs_end_time > end_time)
+ end_time = vrs->vrs_end_time;
+
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ rebuilding = B_TRUE;
+ end_time = 0;
+ break;
+ }
+ }
+ }
+
+ if (rebuild_end_time != NULL)
+ *rebuild_end_time = end_time;
+
+ return (rebuilding);
+}
+
+/*
+ * Print the scan status.
+ */
+static void
+print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+ uint64_t rebuild_end_time = 0, resilver_end_time = 0;
+ boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
+ boolean_t active_resilver = B_FALSE;
+ pool_checkpoint_stat_t *pcs = NULL;
+ pool_scan_stat_t *ps = NULL;
+ uint_t c;
+
+ if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &c) == 0) {
+ if (ps->pss_func == POOL_SCAN_RESILVER) {
+ resilver_end_time = ps->pss_end_time;
+ active_resilver = (ps->pss_state == DSS_SCANNING);
+ }
+
+ have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
+ have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+ }
+
+ boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
+ boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
+
+ /* Always print the scrub status when available. */
+ if (have_scrub)
+ print_scan_scrub_resilver_status(ps);
+
+ /*
+ * When there is an active resilver or rebuild print its status.
+ * Otherwise print the status of the last resilver or rebuild.
+ */
+ if (active_resilver || (!active_rebuild && have_resilver &&
+ resilver_end_time && resilver_end_time > rebuild_end_time)) {
+ print_scan_scrub_resilver_status(ps);
+ } else if (active_rebuild || (!active_resilver && have_rebuild &&
+ rebuild_end_time && rebuild_end_time > resilver_end_time)) {
+ print_rebuild_status(zhp, nvroot);
+ }
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+ print_checkpoint_scan_warning(ps, pcs);
+}
+
+/*
* Print out detailed removal status.
*/
static void
@@ -7504,7 +7727,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
for (i = 0; i < nspares; i++) {
name = zpool_vdev_name(g_zfs, zhp, spares[i],
cb->cb_name_flags);
- print_status_config(zhp, cb, name, spares[i], 2, B_TRUE);
+ print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL);
free(name);
}
}
@@ -7524,7 +7747,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
for (i = 0; i < nl2cache; i++) {
name = zpool_vdev_name(g_zfs, zhp, l2cache[i],
cb->cb_name_flags);
- print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE);
+ print_status_config(zhp, cb, name, l2cache[i], 2,
+ B_FALSE, NULL);
free(name);
}
}
@@ -7718,6 +7942,7 @@ status_callback(zpool_handle_t *zhp, void *data)
break;
case ZPOOL_STATUS_RESILVERING:
+ case ZPOOL_STATUS_REBUILDING:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
@@ -7727,6 +7952,16 @@ status_callback(zpool_handle_t *zhp, void *data)
"complete.\n"));
break;
+ case ZPOOL_STATUS_REBUILD_SCRUB:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices have "
+ "been sequentially resilvered, scrubbing\n\tthe pool "
+ "is recommended.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
+ "verify all data checksums.\n"));
+ break;
+
case ZPOOL_STATUS_CORRUPT_DATA:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices has "
@@ -7951,18 +8186,16 @@ status_callback(zpool_handle_t *zhp, void *data)
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
pool_checkpoint_stat_t *pcs = NULL;
- pool_scan_stat_t *ps = NULL;
pool_removal_stat_t *prs = NULL;
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+ print_scan_status(zhp, nvroot);
+
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
- print_scan_status(ps);
- print_checkpoint_scan_warning(ps, pcs);
print_removal_status(zhp, prs);
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
print_checkpoint_status(pcs);
cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
@@ -7987,7 +8220,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf("\n");
print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
- B_FALSE);
+ B_FALSE, NULL);
print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
@@ -9543,6 +9776,36 @@ vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
return (bytes_remaining);
}
+/* Add up the total number of bytes left to rebuild across top-level vdevs */
+static uint64_t
+vdev_activity_top_remaining(nvlist_t *nv)
+{
+ uint64_t bytes_remaining = 0;
+ nvlist_t **child;
+ uint_t children;
+ int error;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ error = nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i);
+ if (error == 0) {
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ bytes_remaining += (vrs->vrs_bytes_est -
+ vrs->vrs_bytes_rebuilt);
+ }
+ }
+ }
+
+ return (bytes_remaining);
+}
+
/* Whether any vdevs are 'spare' or 'replacing' vdevs */
static boolean_t
vdev_any_spare_replacing(nvlist_t *nv)
@@ -9652,6 +9915,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
bytes_rem[ZPOOL_WAIT_SCRUB] = rem;
else
bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
+ } else if (check_rebuilding(nvroot, NULL)) {
+ bytes_rem[ZPOOL_WAIT_RESILVER] =
+ vdev_activity_top_remaining(nvroot);
}
bytes_rem[ZPOOL_WAIT_INITIALIZE] =
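With check_rebuilding() wired into print_wait_status_row() above, a sequential rebuild is reported under the same resilver activity as a healing resilver, so waiting works the same way from the CLI. A usage sketch ("tank" is a placeholder pool name):

    # Block until any in-progress resilver, healing or sequential,
    # has finished.
    zpool wait -t resilver tank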
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index ce748da18..ca38271cc 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -3507,7 +3507,16 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
ashift, NULL, 0, 0, 1);
- error = spa_vdev_attach(spa, oldguid, root, replacing);
+ /*
+ * When supported select either a healing or sequential resilver.
+ */
+ boolean_t rebuilding = B_FALSE;
+ if (pvd->vdev_ops == &vdev_mirror_ops ||
+ pvd->vdev_ops == &vdev_root_ops) {
+ rebuilding = !!ztest_random(2);
+ }
+
+ error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);
nvlist_free(root);
@@ -3527,10 +3536,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
expected_error = error;
if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
- error == ZFS_ERR_DISCARDING_CHECKPOINT)
+ error == ZFS_ERR_DISCARDING_CHECKPOINT ||
+ error == ZFS_ERR_RESILVER_IN_PROGRESS ||
+ error == ZFS_ERR_REBUILD_IN_PROGRESS)
expected_error = error;
- /* XXX workaround 6690467 */
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",
diff --git a/configure.ac b/configure.ac
index e405ddb57..c7f813d19 100644
--- a/configure.ac
+++ b/configure.ac
@@ -368,7 +368,6 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/rename_dirs/Makefile
tests/zfs-tests/tests/functional/replacement/Makefile
tests/zfs-tests/tests/functional/reservation/Makefile
- tests/zfs-tests/tests/functional/resilver/Makefile
tests/zfs-tests/tests/functional/rootpool/Makefile
tests/zfs-tests/tests/functional/rsend/Makefile
tests/zfs-tests/tests/functional/scrub_mirror/Makefile
diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py
index 5c285164b..50dca67f3 100644
--- a/contrib/pyzfs/libzfs_core/_constants.py
+++ b/contrib/pyzfs/libzfs_core/_constants.py
@@ -95,6 +95,8 @@ zfs_errno = enum_with_offset(1024, [
'ZFS_ERR_EXPORT_IN_PROGRESS',
'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR',
'ZFS_ERR_STREAM_TRUNCATED',
+ 'ZFS_ERR_RESILVER_IN_PROGRESS',
+ 'ZFS_ERR_REBUILD_IN_PROGRESS',
],
{}
)
diff --git a/include/libzfs.h b/include/libzfs.h
index 64a0a2035..873e8f304 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -79,7 +79,7 @@ typedef enum zfs_error {
EZFS_NODEVICE, /* no such device in pool */
EZFS_BADDEV, /* invalid device to add */
EZFS_NOREPLICAS, /* no valid replicas */
- EZFS_RESILVERING, /* currently resilvering */
+ EZFS_RESILVERING, /* resilvering (healing reconstruction) */
EZFS_BADVERSION, /* unsupported version */
EZFS_POOLUNAVAIL, /* pool is currently unavailable */
EZFS_DEVOVERFLOW, /* too many devices in one vdev */
@@ -148,6 +148,7 @@ typedef enum zfs_error {
EZFS_TRIM_NOTSUP, /* device does not support trim */
EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */
EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */
+ EZFS_REBUILDING, /* resilvering (sequential reconstruction) */
EZFS_UNKNOWN
} zfs_error_t;
@@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
extern int zpool_vdev_attach(zpool_handle_t *, const char *,
- const char *, nvlist_t *, int);
+ const char *, nvlist_t *, int, boolean_t);
extern int zpool_vdev_detach(zpool_handle_t *, const char *);
extern int zpool_vdev_remove(zpool_handle_t *, const char *);
extern int zpool_vdev_remove_cancel(zpool_handle_t *);
@@ -387,6 +388,8 @@ typedef enum {
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device offline */
ZPOOL_STATUS_REMOVED_DEV, /* removed device */
+ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */
+ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */
/*
* Finally, the following indicates a healthy pool.
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index ce781aa4c..0659c6419 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -89,6 +89,7 @@ COMMON_H = \
vdev_initialize.h \
vdev_raidz.h \
vdev_raidz_impl.h \
+ vdev_rebuild.h \
vdev_removal.h \
vdev_trim.h \
xvattr.h \
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index bcb896da3..8f929207d 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -42,6 +42,8 @@ struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
+extern int zfs_scan_suspend_progress;
+
/*
* All members of this structure must be uint64_t, for byteswap
* purposes.
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 575a4af51..1bfd7a485 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -704,6 +704,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
#define ZPOOL_CONFIG_REMOVING "removing"
#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
+#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg"
#define ZPOOL_CONFIG_COMMENT "comment"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */
@@ -730,6 +731,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */
+#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"
/*
* The persistent vdev state is stored as separate values rather than a single
@@ -778,6 +780,9 @@ typedef struct zpool_load_policy {
#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \
"com.delphix:ms_unflushed_phys_txgs"
+#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \
+ "org.openzfs:vdev_rebuild"
+
#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
"org.zfsonlinux:allocation_bias"
@@ -991,6 +996,21 @@ typedef enum dsl_scan_state {
DSS_NUM_STATES
} dsl_scan_state_t;
+typedef struct vdev_rebuild_stat {
+ uint64_t vrs_state; /* vdev_rebuild_state_t */
+ uint64_t vrs_start_time; /* time_t */
+ uint64_t vrs_end_time; /* time_t */
+ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */
+ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */
+ uint64_t vrs_bytes_issued; /* read bytes issued */
+ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */
+ uint64_t vrs_bytes_est; /* total bytes to scan */
+ uint64_t vrs_errors; /* scanning errors */
+ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
+ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
+ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */
+} vdev_rebuild_stat_t;
+
/*
* Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering
* of this enum must be maintained to ensure the errata identifiers map to
@@ -1047,6 +1067,7 @@ typedef struct vdev_stat {
uint64_t vs_trim_bytes_est; /* total bytes to trim */
uint64_t vs_trim_state; /* vdev_trim_state_t */
uint64_t vs_trim_action_time; /* time_t */
+ uint64_t vs_rebuild_processed; /* bytes rebuilt */
} vdev_stat_t;
/*
@@ -1178,6 +1199,13 @@ typedef enum {
VDEV_TRIM_COMPLETE,
} vdev_trim_state_t;
+typedef enum {
+ VDEV_REBUILD_NONE,
+ VDEV_REBUILD_ACTIVE,
+ VDEV_REBUILD_CANCELED,
+ VDEV_REBUILD_COMPLETE,
+} vdev_rebuild_state_t;
+
/*
* nvlist name constants. Facilitate restricting snapshot iteration range for
* the "list next snapshot" ioctl
@@ -1337,6 +1365,8 @@ typedef enum {
ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR,
ZFS_ERR_STREAM_TRUNCATED,
ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH,
+ ZFS_ERR_RESILVER_IN_PROGRESS,
+ ZFS_ERR_REBUILD_IN_PROGRESS,
} zfs_errno_t;
/*
@@ -1478,7 +1508,12 @@ typedef enum {
* given payloads:
*
* ESC_ZFS_RESILVER_START
- * ESC_ZFS_RESILVER_END
+ * ESC_ZFS_RESILVER_FINISH
+ *
+ * ZFS_EV_POOL_NAME DATA_TYPE_STRING
+ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
+ * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING
+ *
* ESC_ZFS_POOL_DESTROY
* ESC_ZFS_POOL_REGUID
*
@@ -1532,6 +1567,7 @@ typedef enum {
#define ZFS_EV_HIST_IOCTL "history_ioctl"
#define ZFS_EV_HIST_DSNAME "history_dsname"
#define ZFS_EV_HIST_DSID "history_dsid"
+#define ZFS_EV_RESILVER_TYPE "resilver_type"
#ifdef __cplusplus
}
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 5806dda41..9b96eb1f8 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
#define SPA_ASYNC_L2CACHE_REBUILD 0x800
#define SPA_ASYNC_L2CACHE_TRIM 0x1000
-
-/*
- * Controls the behavior of spa_vdev_remove().
- */
-#define SPA_REMOVE_UNSPARE 0x01
-#define SPA_REMOVE_DONE 0x02
+#define SPA_ASYNC_REBUILD_DONE 0x2000
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
- int replacing);
+ int replacing, int rebuild);
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
@@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
extern uint64_t spa_vdev_config_enter(spa_t *spa);
extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
int error, char *tag);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 6481d5397..2c52cb666 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -36,6 +36,7 @@
#include <sys/spa_checkpoint.h>
#include <sys/spa_log_spacemap.h>
#include <sys/vdev.h>
+#include <sys/vdev_rebuild.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
#include <sys/dmu.h>
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index d93ef78f1..a7e880636 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
- int scrub_done);
+ boolean_t scrub_done, boolean_t rebuild_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 56407a191..b9298c62d 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -38,6 +38,7 @@
#include <sys/uberblock_impl.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_rebuild.h>
#include <sys/vdev_removal.h>
#include <sys/zfs_ratelimit.h>
@@ -295,13 +296,26 @@ struct vdev {
uint64_t vdev_trim_secure; /* requested secure TRIM */
uint64_t vdev_trim_action_time; /* start and end time */
- /* for limiting outstanding I/Os (initialize and TRIM) */
+ /* Rebuild related */
+ boolean_t vdev_rebuilding;
+ boolean_t vdev_rebuild_exit_wanted;
+ boolean_t vdev_rebuild_cancel_wanted;
+ boolean_t vdev_rebuild_reset_wanted;
+ kmutex_t vdev_rebuild_lock;
+ kcondvar_t vdev_rebuild_cv;
+ kthread_t *vdev_rebuild_thread;
+ vdev_rebuild_t vdev_rebuild_config;
+
+ /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */
kmutex_t vdev_initialize_io_lock;
kcondvar_t vdev_initialize_io_cv;
uint64_t vdev_initialize_inflight;
kmutex_t vdev_trim_io_lock;
kcondvar_t vdev_trim_io_cv;
uint64_t vdev_trim_inflight[3];
+ kmutex_t vdev_rebuild_io_lock;
+ kcondvar_t vdev_rebuild_io_cv;
+ uint64_t vdev_rebuild_inflight;
/*
* Values stored in the config for an indirect or removing vdev.
@@ -358,6 +372,7 @@ struct vdev {
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
uint64_t vdev_resilver_txg; /* persistent resilvering state */
+ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h
new file mode 100644
index 000000000..3d4b8cc46
--- /dev/null
+++ b/include/sys/vdev_rebuild.h
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_VDEV_REBUILD_H
+#define _SYS_VDEV_REBUILD_H
+
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Number of entries in the physical vdev_rebuild_phys structure. This
+ * state is stored per top-level vdev as VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ */
+#define REBUILD_PHYS_ENTRIES 12
+
+/*
+ * On-disk rebuild configuration and state. When adding new fields they
+ * must be added to the end of the structure.
+ */
+typedef struct vdev_rebuild_phys {
+ uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */
+ uint64_t vrp_last_offset; /* last rebuilt offset */
+ uint64_t vrp_min_txg; /* minimum missing txg */
+ uint64_t vrp_max_txg; /* maximum missing txg */
+ uint64_t vrp_start_time; /* start time */
+ uint64_t vrp_end_time; /* end time */
+ uint64_t vrp_scan_time_ms; /* total run time in ms */
+ uint64_t vrp_bytes_scanned; /* alloc bytes scanned */
+ uint64_t vrp_bytes_issued; /* read bytes issued */
+ uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */
+ uint64_t vrp_bytes_est; /* total bytes to scan */
+ uint64_t vrp_errors; /* errors during rebuild */
+} vdev_rebuild_phys_t;
+
+/*
+ * The vdev_rebuild_t describes the current state and how a top-level vdev
+ * should be rebuilt. The core elements are the top-vdev, the metaslab being
+ * rebuilt, the range tree containing the allocated extents, and the on-disk
+ * state.
+ */
+typedef struct vdev_rebuild {
+ vdev_t *vr_top_vdev; /* top-level vdev to rebuild */
+ metaslab_t *vr_scan_msp; /* scanning disabled metaslab */
+ range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */
+
+ /* In-core state and progress */
+ uint64_t vr_scan_offset[TXG_SIZE];
+ uint64_t vr_prev_scan_time_ms; /* any previous scan time */
+
+ /* Per-rebuild pass statistics for calculating bandwidth */
+ uint64_t vr_pass_start_time;
+ uint64_t vr_pass_bytes_scanned;
+ uint64_t vr_pass_bytes_issued;
+
+ /* On-disk state updated by vdev_rebuild_zap_update_sync() */
+ vdev_rebuild_phys_t vr_rebuild_phys;
+} vdev_rebuild_t;
+
+boolean_t vdev_rebuild_active(vdev_t *);
+
+int vdev_rebuild_load(vdev_t *);
+void vdev_rebuild(vdev_t *);
+void vdev_rebuild_stop_wait(vdev_t *);
+void vdev_rebuild_stop_all(spa_t *);
+void vdev_rebuild_restart(spa_t *);
+void vdev_rebuild_clear_sync(void *, dmu_tx_t *);
+int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REBUILD_H */
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index 0b422904e..2d8e7fc36 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -31,6 +31,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_TRIM, /* trim I/O (discard) */
+ ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 2d8767d5b..7e19a62e2 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -74,6 +74,7 @@ typedef enum spa_feature {
SPA_FEATURE_BOOKMARK_WRITTEN,
SPA_FEATURE_LOG_SPACEMAP,
SPA_FEATURE_LIVELIST,
+ SPA_FEATURE_DEVICE_REBUILD,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 11b3d4cd9..f848cb3cf 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2446,7 +2446,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
- if (ps && ps->pss_func == POOL_SCAN_SCRUB) {
+ if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
+ ps->pss_state == DSS_SCANNING) {
if (cmd == POOL_SCRUB_PAUSE)
return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
else
@@ -3128,8 +3129,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
* If 'replacing' is specified, the new disk will replace the old one.
*/
int
-zpool_vdev_attach(zpool_handle_t *zhp,
- const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
+zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
+ const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild)
{
zfs_cmd_t zc = {"\0"};
char msg[1024];
@@ -3164,6 +3165,14 @@ zpool_vdev_attach(zpool_handle_t *zhp,
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
zc.zc_cookie = replacing;
+ zc.zc_simple = rebuild;
+
+ if (rebuild &&
+ zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "the loaded zfs module doesn't support device rebuilds"));
+ return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
+ }
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0 || children != 1) {
@@ -3224,16 +3233,21 @@ zpool_vdev_attach(zpool_handle_t *zhp,
uint64_t version = zpool_get_prop_int(zhp,
ZPOOL_PROP_VERSION, NULL);
- if (islog)
+ if (islog) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a log with a spare"));
- else if (version >= SPA_VERSION_MULTI_REPLACE)
+ } else if (rebuild) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "only mirror vdevs support sequential "
+ "reconstruction"));
+ } else if (version >= SPA_VERSION_MULTI_REPLACE) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"already in replacing/spare config; wait "
"for completion or use 'zpool detach'"));
- else
+ } else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a replacing device"));
+ }
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only attach to mirrors and top-level "
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
index ebf497db6..67b8ea33e 100644
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@@ -84,6 +84,8 @@ static char *zfs_msgid_table[] = {
* ZPOOL_STATUS_RESILVERING
* ZPOOL_STATUS_OFFLINE_DEV
* ZPOOL_STATUS_REMOVED_DEV
+ * ZPOOL_STATUS_REBUILDING
+ * ZPOOL_STATUS_REBUILD_SCRUB
* ZPOOL_STATUS_OK
*/
};
@@ -195,7 +197,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
* - Check for any data errors
* - Check for any faulted or missing devices in a replicated config
* - Look for any devices showing errors
- * - Check for any resilvering devices
+ * - Check for any resilvering or rebuilding devices
*
* There can obviously be multiple errors within a single pool, so this routine
* only picks the most damaging of all the current errors to report.
@@ -234,6 +236,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
return (ZPOOL_STATUS_RESILVERING);
/*
+ * Currently rebuilding a vdev, check top-level vdevs.
+ */
+ vdev_rebuild_stat_t *vrs = NULL;
+ nvlist_t **child;
+ uint_t c, i, children;
+ uint64_t rebuild_end_time = 0;
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if ((nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS,
+ (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) {
+ uint64_t state = vrs->vrs_state;
+
+ if (state == VDEV_REBUILD_ACTIVE) {
+ return (ZPOOL_STATUS_REBUILDING);
+ } else if (state == VDEV_REBUILD_COMPLETE &&
+ vrs->vrs_end_time > rebuild_end_time) {
+ rebuild_end_time = vrs->vrs_end_time;
+ }
+ }
+ }
+
+ /*
+ * If we can determine when the last scrub was run, and it
+ * was before the last rebuild completed, then recommend
+ * that the pool be scrubbed to verify all checksums. When
+ * ps is NULL we can infer the pool has never been scrubbed.
+ */
+ if (rebuild_end_time > 0) {
+ if (ps != NULL) {
+ if ((ps->pss_state == DSS_FINISHED &&
+ ps->pss_func == POOL_SCAN_SCRUB &&
+ rebuild_end_time > ps->pss_end_time) ||
+ ps->pss_state == DSS_NONE)
+ return (ZPOOL_STATUS_REBUILD_SCRUB);
+ } else {
+ return (ZPOOL_STATUS_REBUILD_SCRUB);
+ }
+ }
+ }
+
+ /*
* The multihost property is set and the pool may be active.
*/
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 21bd8289c..2f4aaed32 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -286,6 +286,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
"resilver_defer feature"));
case EZFS_EXPORT_IN_PROGRESS:
return (dgettext(TEXT_DOMAIN, "pool export in progress"));
+ case EZFS_REBUILDING:
+ return (dgettext(TEXT_DOMAIN, "currently sequentially "
+ "resilvering"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@@ -693,6 +696,12 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_EXPORT_IN_PROGRESS:
zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap);
break;
+ case ZFS_ERR_RESILVER_IN_PROGRESS:
+ zfs_verror(hdl, EZFS_RESILVERING, fmt, ap);
+ break;
+ case ZFS_ERR_REBUILD_IN_PROGRESS:
+ zfs_verror(hdl, EZFS_REBUILDING, fmt, ap);
+ break;
case ZFS_ERR_IOC_CMD_UNAVAIL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
"module does not support this operation. A reboot may "
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 46befa7d4..06b89fe0a 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -132,6 +132,7 @@ KERNEL_C = \
vdev_raidz_math_sse2.c \
vdev_raidz_math_ssse3.c \
vdev_raidz_math_powerpc_altivec.c \
+ vdev_rebuild.c \
vdev_removal.c \
vdev_root.c \
vdev_trim.c \
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 687b85d0b..3fbd3c67f 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1865,6 +1865,30 @@ Default value: \fB1,000\fR.
.sp
.ne 2
.na
+\fBzfs_vdev_rebuild_max_active\fR (int)
+.ad
+.RS 12n
+Maximum sequential resilver I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_rebuild_min_active\fR (int)
+.ad
+.RS 12n
+Minimum sequential resilver I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_removal_max_active\fR (int)
.ad
.RS 12n
@@ -2710,6 +2734,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
+\fBzfs_rebuild_max_segment\fR (ulong)
+.ad
+.RS 12n
+Maximum read segment size to issue when sequentially resilvering a
+top-level vdev.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_reconstruct_indirect_combinations_max\fR (int)
.ad
.RS 12n
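A sketch of inspecting and tuning the new module parameters at runtime, assuming the Linux sysfs interface under /sys/module/zfs/parameters:

    # Per-device queue depth for sequential resilver I/Os.
    cat /sys/module/zfs/parameters/zfs_vdev_rebuild_max_active
    echo 6 > /sys/module/zfs/parameters/zfs_vdev_rebuild_max_active
    # Largest read segment issued while sequentially resilvering.
    cat /sys/module/zfs/parameters/zfs_rebuild_max_segment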
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index e7a61957f..3f690c334 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -258,6 +258,35 @@ returned to the \fBenabled\fR state when all bookmarks with these fields are des
.sp
.ne 2
.na
+\fBdevice_rebuild\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.openzfs:device_rebuild
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature enables the ability for the \fBzpool attach\fR and \fBzpool
+replace\fR subcommands to perform sequential reconstruction (instead of
+healing reconstruction) when resilvering.
+
+Sequential reconstruction resilvers a device in LBA order without immediately
+verifying the checksums. Once complete a scrub is started which then verifies
+the checksums. This approach allows full redundancy to be restored to the pool
+in the minimum amount of time. This two-phase approach will take longer than a
+healing resilver when the time to verify the checksums is included. However,
+unless there is additional pool damage, no checksum errors should be reported
+by the scrub. This feature is incompatible with raidz configurations.
+
+This feature becomes \fBactive\fR while a sequential resilver is in progress,
+and returns to \fBenabled\fR when the resilver completes.
+.RE
+
+.sp
+.ne 2
+.na
\fBdevice_removal\fR
.ad
.RS 4n
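Following up on the device_rebuild entry above, a sketch of enabling the feature on an existing pool and checking its state ("tank" is a placeholder pool name):

    # device_rebuild is read-only compatible and only becomes active
    # while a sequential resilver is running.
    zpool set feature@device_rebuild=enabled tank
    zpool get feature@device_rebuild tank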
diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8
index be0be4e07..585357b96 100644
--- a/man/man8/zpool-attach.8
+++ b/man/man8/zpool-attach.8
@@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
-.Dd August 9, 2019
+.Dd May 15, 2020
.Dt ZPOOL-ATTACH 8
.Os Linux
.Sh NAME
@@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm
.Cm attach
-.Op Fl fw
+.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool device new_device
.Sh DESCRIPTION
@@ -44,7 +44,7 @@
.It Xo
.Nm
.Cm attach
-.Op Fl fw
+.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool device new_device
.Xc
@@ -68,22 +68,29 @@ is part of a two-way mirror, attaching
creates a three-way mirror, and so on.
In either case,
.Ar new_device
-begins to resilver immediately.
+begins to resilver immediately and any running scrub is cancelled.
.Bl -tag -width Ds
.It Fl f
Forces use of
.Ar new_device ,
even if it appears to be in use.
Not all devices can be overridden in this manner.
-.It Fl w
-Waits until
-.Ar new_device
-has finished resilvering before returning.
.It Fl o Ar property Ns = Ns Ar value
Sets the given pool properties. See the
.Xr zpoolprops 8
manual page for a list of valid properties that can be set. The only property
supported at the moment is ashift.
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction, so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
+.It Fl w
+Waits until
+.Ar new_device
+has finished resilvering before returning.
.El
.El
.Sh SEE ALSO
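A usage sketch for the new -s flag documented above (pool and device names are placeholders):

    # Attach sdb to the mirror containing sda, reconstruct it
    # sequentially, and wait for the rebuild to finish before returning.
    zpool attach -sw tank sda sdb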
diff --git a/man/man8/zpool-replace.8 b/man/man8/zpool-replace.8
index 933fb4ae9..5e639feaf 100644
--- a/man/man8/zpool-replace.8
+++ b/man/man8/zpool-replace.8
@@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
-.Dd August 9, 2019
+.Dd May 15, 2020
.Dt ZPOOL-REPLACE 8
.Os Linux
.Sh NAME
@@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm
.Cm replace
-.Op Fl fw
+.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool Ar device Op Ar new_device
.Sh DESCRIPTION
@@ -44,7 +44,7 @@
.It Xo
.Nm
.Cm replace
-.Op Fl fw
+.Op Fl fsw
.Op Fl o Ar property Ns = Ns Ar value
.Ar pool Ar device Op Ar new_device
.Xc
@@ -56,6 +56,7 @@ This is equivalent to attaching
.Ar new_device ,
waiting for it to resilver, and then detaching
.Ar old_device .
+Any in-progress scrub will be cancelled.
.Pp
The size of
.Ar new_device
@@ -86,6 +87,13 @@ Sets the given pool properties. See the
manual page for a list of valid properties that can be set.
The only property supported at the moment is
.Sy ashift .
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction, so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
.It Fl w
Waits until the replacement has completed before returning.
.El
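The corresponding sketch for replace (pool and device names are placeholders):

    # Sequentially reconstruct sdb in place of the failed sda; a
    # verifying scrub is started automatically when the rebuild completes.
    zpool replace -s tank sda sdb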
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8
index 7364bf635..66e335995 100644
--- a/man/man8/zpool-status.8
+++ b/man/man8/zpool-status.8
@@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
-.Dd August 9, 2019
+.Dd May 15, 2020
.Dt ZPOOL-STATUS 8
.Os Linux
.Sh NAME
@@ -59,7 +59,7 @@ is specified, then the status of each pool in the system is displayed.
For more information on pool and device health, see the
.Em Device Failure and Recovery
section of
-.Xr zpoolconcepts 8.
+.Xr zpoolconcepts 8 .
.Pp
If a scrub or resilver is in progress, this command reports the percentage done
and the estimated time to completion.
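While a rebuild is running, the progress emitted by print_rebuild_status_impl() above appears on the "scan:" line of zpool status, for example ("tank" is a placeholder pool name):

    # The scan: line reports "resilver (<vdev>) in progress since ..."
    # along with scan/issue rates and an estimated completion time.
    zpool status tank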
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 7c83113ac..1ac9d00e7 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -251,6 +251,7 @@ SRCS+= abd.c \
vdev_raidz.c \
vdev_raidz_math.c \
vdev_raidz_math_scalar.c \
+ vdev_rebuild.c \
vdev_raidz_math_avx2.c \
vdev_raidz_math_avx512bw.c \
vdev_raidz_math_avx512f.c \
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index cf3006721..302d48570 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -570,6 +570,11 @@ zpool_feature_init(void)
"com.datto:resilver_defer", "resilver_defer",
"Support for deferring new resilvers when one is already running.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
+ "org.openzfs:device_rebuild", "device_rebuild",
+ "Support for sequential device rebuilds",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
}
#if defined(_KERNEL)
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 7ea976d12..9ddcd6c33 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o
$(MODULE)-objs += vdev_raidz.o
$(MODULE)-objs += vdev_raidz_math.o
$(MODULE)-objs += vdev_raidz_math_scalar.o
+$(MODULE)-objs += vdev_rebuild.o
$(MODULE)-objs += vdev_removal.o
$(MODULE)-objs += vdev_root.o
$(MODULE)-objs += vdev_trim.o
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 895ffbf0a..712af664e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -704,8 +704,9 @@ static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
- if (dsl_scan_is_running(scn))
+ if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
return (SET_ERROR(EBUSY));
return (0);
@@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (vdev_resilver_needed(spa->spa_root_vdev,
&scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
- spa_event_notify(spa, NULL, NULL,
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
ESC_ZFS_RESILVER_START);
+ nvlist_free(aux);
} else {
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
}
@@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+ /*
+ * When starting a resilver, clear any existing rebuild state.
+ * This is required to prevent stale rebuild status from
+ * being reported when a rebuild is run, then a resilver, and
+ * finally a scrub, in which case only the scrub status
+ * should be reported by 'zpool status'.
+ */
+ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ vdev_rebuild_clear_sync(
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ }
+ }
}
/* back to the generic stuff */
@@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
if (complete &&
!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- scn->scn_phys.scn_max_txg, B_TRUE);
-
- spa_event_notify(spa, NULL, NULL,
- scn->scn_phys.scn_min_txg ?
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
+
+ if (scn->scn_phys.scn_min_txg) {
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
+ ESC_ZFS_RESILVER_FINISH);
+ nvlist_free(aux);
+ } else {
+ spa_event_notify(spa, NULL, NULL,
+ ESC_ZFS_SCRUB_FINISH);
+ }
} else {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- 0, B_TRUE);
+ 0, B_TRUE, B_FALSE);
}
spa_errlog_rotate(spa);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 943330886..6b60227d2 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -57,6 +57,7 @@
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
+#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
@@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa)
vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
}
/*
@@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
* Propagate the leaf DTLs we just loaded all the way up the vdev tree.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
spa_config_exit(spa, SCL_ALL, FTAG);
return (0);
@@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
update_config_cache);
/*
- * Check all DTLs to see if anything needs resilvering.
+ * Check if a rebuild was in progress and if so resume it.
+ * Then check all DTLs to see if anything needs resilvering.
+ * The resilver will be deferred if a rebuild was started.
*/
- if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
- vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ if (vdev_rebuild_active(spa->spa_root_vdev)) {
+ vdev_rebuild_restart(spa);
+ } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
/*
* Log the fact that we booted up (so that we can detect if
@@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
}
/*
@@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* extra rules: you can't attach to it after it's been created, and upon
* completion of resilvering, the first disk (the one being replaced)
* is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction. From
+ * an administrator's perspective these are both resilver operations.
*/
int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
+ int rebuild)
{
uint64_t txg, dtl_max_txg;
- vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
+ vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
@@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
+ if (rebuild) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RESILVER_IN_PROGRESS));
+ } else {
+ if (vdev_rebuild_active(rvd))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_REBUILD_IN_PROGRESS));
+ }
+
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
@@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ if (rebuild) {
+ /*
+ * For rebuilds, the parent vdev must support reconstruction
+ * using only space maps. This means the only allowable
+ * parents are the root vdev or a mirror vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+ }
+
if (!replacing) {
/*
* For attach, the only allowable parent is a mirror or the root
@@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* than the top-level vdev.
*/
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
- return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* If this is an in-place replacement, update oldvd's path and devid
@@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
- /* mark the device being resilvered */
- newvd->vdev_resilver_txg = txg;
-
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
- dtl_max_txg - TXG_INITIAL);
+ vdev_dtl_dirty(newvd, DTL_MISSING,
+ TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_dirty(tvd, VDD_DTL, newvd, txg);
/*
- * Schedule the resilver to restart in the future. We do this to
- * ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets. We do not do this if resilvers have been
- * deferred.
+ * Schedule the resilver or rebuild to restart in the future. We do
+ * this to ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
*/
- if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
- vdev_defer_resilver(newvd);
- else
- dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg);
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
+ } else {
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
+ }
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
ASSERT(spa_writeable(spa));
- txg = spa_vdev_enter(spa);
+ txg = spa_vdev_detach_enter(spa, guid);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa)
}
spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * If a detach was not performed above, replace waiters will not have
+ * been notified, in which case we must do so now.
+ */
+ spa_notify_waiters(spa);
}
/*
@@ -7971,9 +8021,21 @@ spa_async_thread(void *arg)
spa_vdev_resilver_done(spa);
/*
+ * If any devices are done replacing, detach them. Then, if no
+ * top-level vdevs are rebuilding, attempt to kick off a scrub.
+ */
+ if (tasks & SPA_ASYNC_REBUILD_DONE) {
+ spa_vdev_resilver_done(spa);
+
+ if (!vdev_rebuild_active(spa->spa_root_vdev))
+ (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
+ }
+
+ /*
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER &&
+ !vdev_rebuild_active(spa->spa_root_vdev) &&
(!dsl_scan_resilvering(dp) ||
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
dsl_scan_restart_resilver(dp, 0);
@@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
+ if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ break;
+ /* fall through */
case ZPOOL_WAIT_SCRUB:
{
boolean_t scanning, paused, is_scrub;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 61cefa3dd..4c884409a 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1166,6 +1166,30 @@ spa_vdev_enter(spa_t *spa)
}
/*
+ * The same as spa_vdev_enter() above but additionally takes the guid of
+ * the vdev being detached. When there is a rebuild in progress it will be
+ * suspended while the vdev tree is modified, then resumed by spa_vdev_exit().
+ * The rebuild is canceled if only a single child remains after the detach.
+ */
+uint64_t
+spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+
+ vdev_autotrim_stop_all(spa);
+
+ if (guid != 0) {
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd) {
+ vdev_rebuild_stop_wait(vd->vdev_top);
+ }
+ }
+
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
* Internal implementation for spa_vdev_enter(). Used when a vdev
* operation requires multiple syncs (i.e. removing a device) while
* keeping the spa_namespace_lock held.
@@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
/*
* Reassess the DTLs.
*/
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
config_changed = B_TRUE;
@@ -1271,6 +1295,7 @@ int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
vdev_autotrim_restart(spa);
+ vdev_rebuild_restart(spa);
spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
}
if (vd != NULL || error == 0)
- vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE);
+ vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
if (vd != NULL) {
if (vd != spa->spa_root_vdev)
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 03360120a..27ac17fea 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -39,6 +39,7 @@
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
+
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
0);
}
+
txg_list_create(&vd->vdev_ms_list, spa,
offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list, spa,
@@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ &vd->vdev_rebuild_txg);
+
if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
vdev_defer_resilver(vd);
@@ -890,6 +902,7 @@ vdev_free(vdev_t *vd)
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
@@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
+
mutex_destroy(&vd->vdev_initialize_lock);
mutex_destroy(&vd->vdev_initialize_io_lock);
cv_destroy(&vd->vdev_initialize_io_cv);
cv_destroy(&vd->vdev_initialize_cv);
+
mutex_destroy(&vd->vdev_trim_lock);
mutex_destroy(&vd->vdev_autotrim_lock);
mutex_destroy(&vd->vdev_trim_io_lock);
@@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_autotrim_cv);
cv_destroy(&vd->vdev_trim_io_cv);
+ mutex_destroy(&vd->vdev_rebuild_lock);
+ mutex_destroy(&vd->vdev_rebuild_io_lock);
+ cv_destroy(&vd->vdev_rebuild_cv);
+ cv_destroy(&vd->vdev_rebuild_io_cv);
+
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
ASSERT0(tvd->vdev_removing);
+ ASSERT0(tvd->vdev_rebuilding);
tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_rebuilding = svd->vdev_rebuilding;
+ tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
tvd->vdev_indirect_config = svd->vdev_indirect_config;
tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
tvd->vdev_indirect_births = svd->vdev_indirect_births;
@@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
svd->vdev_removing = 0;
+ svd->vdev_rebuilding = 0;
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
@@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd)
* excise the DTLs.
*/
static boolean_t
-vdev_dtl_should_excise(vdev_t *vd)
+vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
{
- spa_t *spa = vd->vdev_spa;
- dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
-
ASSERT0(vd->vdev_children);
if (vd->vdev_state < VDEV_STATE_DEGRADED)
@@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd)
if (vd->vdev_resilver_deferred)
return (B_FALSE);
- if (vd->vdev_resilver_txg == 0 ||
- range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+ if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
- /*
- * When a resilver is initiated the scan will assign the scn_max_txg
- * value to the highest txg value that exists in all DTLs. If this
- * device's max DTL is not part of this scan (i.e. it is not in
- * the range (scn_min_txg, scn_max_txg] then it is not eligible
- * for excision.
- */
- if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
- ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
- ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
- ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
- return (B_TRUE);
+ if (rebuild_done) {
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ /* Rebuild not initiated by attach */
+ if (vd->vdev_rebuild_txg == 0)
+ return (B_TRUE);
+
+ /*
+ * When a rebuild completes without error, all missing data
+ * up to the rebuild max txg has been reconstructed and the DTL
+ * is eligible for excision.
+ */
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
+ vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
+ ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
+ ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
+ return (B_TRUE);
+ }
+ } else {
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
+
+ /* Resilver not initiated by attach */
+ if (vd->vdev_resilver_txg == 0)
+ return (B_TRUE);
+
+ /*
+ * When a resilver is initiated the scan will assign the
+ * scn_max_txg value to the highest txg value that exists
+ * in all DTLs. If this device's max DTL is not part of this
+ * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]),
+ * then it is not eligible for excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
+ return (B_TRUE);
+ }
}
+
return (B_FALSE);
}
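
To make the rebuild branch of this check concrete, here is a minimal userland model (not the kernel code; the enum and helper are invented for illustration) of the txg-range test: a completed rebuild records vrp_max_txg, and the DTL may only be excised when every missing txg falls at or below that value.

/* Simplified model of the rebuild-completion excision check. */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef enum { REBUILD_NONE, REBUILD_ACTIVE, REBUILD_COMPLETE } rebuild_state_t;

/* Hypothetical helper mirroring the vrp_rebuild_state / vdev_dtl_max() test. */
static bool
can_excise_after_rebuild(rebuild_state_t state, uint64_t dtl_max_txg,
    uint64_t vrp_max_txg)
{
	/* Everything missing must fall inside the rebuilt txg range. */
	return (state == REBUILD_COMPLETE && dtl_max_txg <= vrp_max_txg);
}

int
main(void)
{
	/* DTL goes up to txg 250, rebuild covered up to txg 300: excise. */
	printf("%d\n", can_excise_after_rebuild(REBUILD_COMPLETE, 250, 300));
	/* DTL extends to txg 350, beyond the rebuild: keep the DTL. */
	printf("%d\n", can_excise_after_rebuild(REBUILD_COMPLETE, 350, 300));
	return (0);
}
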
@@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd)
* write operations will be issued to the pool.
*/
void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ boolean_t scrub_done, boolean_t rebuild_done)
{
spa_t *spa = vd->vdev_spa;
avl_tree_t reftree;
@@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
for (int c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
- scrub_txg, scrub_done);
+ scrub_txg, scrub_done, rebuild_done);
if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ boolean_t check_excise = B_FALSE;
boolean_t wasempty = B_TRUE;
mutex_enter(&vd->vdev_dtl_lock);
/*
- * If requested, pretend the scan completed cleanly.
+ * If requested, pretend the scan or rebuild completed cleanly.
*/
- if (zfs_scan_ignore_errors && scn)
- scn->scn_phys.scn_errors = 0;
+ if (zfs_scan_ignore_errors) {
+ if (scn != NULL)
+ scn->scn_phys.scn_errors = 0;
+ if (vr != NULL)
+ vr->vr_rebuild_phys.vrp_errors = 0;
+ }
if (scrub_txg != 0 &&
!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
@@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
}
/*
- * If we've completed a scan cleanly then determine
- * if this vdev should remove any DTLs. We only want to
- * excise regions on vdevs that were available during
- * the entire duration of this scan.
+ * If we've completed a scrub/resilver or a rebuild cleanly,
+ * then determine if this vdev should remove any DTLs. We
+ * only want to excise regions on vdevs that were available
+ * during the entire duration of this scan.
*/
- if (scrub_txg != 0 &&
- (spa->spa_scrub_started ||
- (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
- vdev_dtl_should_excise(vd)) {
+ if (rebuild_done &&
+ vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
+ check_excise = B_TRUE;
+ } else {
+ if (spa->spa_scrub_started ||
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) {
+ check_excise = B_TRUE;
+ }
+ }
+
+ if (scrub_txg && check_excise &&
+ vdev_dtl_should_excise(vd, rebuild_done)) {
/*
- * We completed a scrub up to scrub_txg. If we
- * did it without rebooting, then the scrub dtl
- * will be valid, so excise the old region and
- * fold in the scrub dtl. Otherwise, leave the
- * dtl as-is if there was an error.
+ * We completed a scrub, resilver or rebuild up to
+ * scrub_txg. If we did it without rebooting, then
+ * the scrub dtl will be valid, so excise the old
+ * region and fold in the scrub dtl. Otherwise,
+ * leave the dtl as-is if there was an error.
*
* There's a little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
@@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
/*
- * If the vdev was resilvering and no longer has any
- * DTLs then reset its resilvering flag and dirty
+ * If the vdev was resilvering or rebuilding and no longer
+ * has any DTLs then reset the appropriate flag and dirty
* the top level so that we persist the change.
*/
- if (txg != 0 && vd->vdev_resilver_txg != 0 &&
+ if (txg != 0 &&
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
- vd->vdev_resilver_txg = 0;
- vdev_config_dirty(vd->vdev_top);
+ if (vd->vdev_rebuild_txg != 0) {
+ vd->vdev_rebuild_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ } else if (vd->vdev_resilver_txg != 0) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
}
mutex_exit(&vd->vdev_dtl_lock);
@@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd)
* If not, we can safely offline/detach/remove the device.
*/
vd->vdev_cant_read = B_TRUE;
- vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
vd->vdev_cant_read = cant_read;
- vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
if (!required && zio_injection_enabled) {
required = !!zio_handle_device_injection(vd, NULL,
@@ -3066,6 +3136,20 @@ vdev_load(vdev_t *vd)
}
/*
+ * Load any rebuild state from the top-level vdev zap.
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ error = vdev_rebuild_load(vd);
+ if (error && error != ENOTSUP) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
+ "failed [error=%d]", error);
+ return (error);
+ }
+ }
+
+ /*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
@@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
+
if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
@@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
vs->vs_trim_state = vd->vdev_trim_state;
vs->vs_trim_action_time = vd->vdev_trim_action_time;
+
+ /* Set when there is a deferred resilver. */
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
+
/*
* Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
@@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
+
+ /*
+ * Report fragmentation and rebuild progress for top-level,
+ * non-auxiliary, concrete devices.
+ */
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
- if (vd->vdev_ops->vdev_op_leaf)
- vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
vdev_get_stats_ex_impl(vd, vs, vsx);
@@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
+ /*
+ * Repair is the result of a resilver issued by the
+ * scan thread (spa_sync).
+ */
if (flags & ZIO_FLAG_SCAN_THREAD) {
- dsl_scan_phys_t *scn_phys =
- &spa->spa_dsl_pool->dp_scan->scn_phys;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scn_phys = &scn->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
- /* XXX cleanup? */
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(processed, psize);
vs->vs_scan_processed += psize;
}
+ /*
+ * Repair is the result of a rebuild issued by the
+ * rebuild thread (vdev_rebuild_thread).
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+ vdev_t *tvd = vd->vdev_top;
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(rebuilt, psize);
+ vs->vs_rebuild_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
zio_type_t vs_type = type;
+ zio_priority_t priority = zio->io_priority;
/*
* TRIM ops and bytes are reported to user space as
@@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_TRIM)
vs_type = ZIO_TYPE_IOCTL;
+ /*
+ * Solely for the purposes of 'zpool iostat -lqrw'
+ * reporting, use the priority to categorize the IO.
+ * Only the following are reported to user space:
+ *
+ * ZIO_PRIORITY_SYNC_READ,
+ * ZIO_PRIORITY_SYNC_WRITE,
+ * ZIO_PRIORITY_ASYNC_READ,
+ * ZIO_PRIORITY_ASYNC_WRITE,
+ * ZIO_PRIORITY_SCRUB,
+ * ZIO_PRIORITY_TRIM.
+ */
+ if (priority == ZIO_PRIORITY_REBUILD) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_SCRUB);
+ } else if (priority == ZIO_PRIORITY_INITIALIZING) {
+ ASSERT3U(type, ==, ZIO_TYPE_WRITE);
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else if (priority == ZIO_PRIORITY_REMOVAL) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
vs->vs_ops[vs_type]++;
vs->vs_bytes[vs_type] += psize;
if (flags & ZIO_FLAG_DELEGATED) {
- vsx->vsx_agg_histo[zio->io_priority]
+ vsx->vsx_agg_histo[priority]
[RQ_HISTO(zio->io_size)]++;
} else {
- vsx->vsx_ind_histo[zio->io_priority]
+ vsx->vsx_ind_histo[priority]
[RQ_HISTO(zio->io_size)]++;
}
if (zio->io_delta && zio->io_delay) {
- vsx->vsx_queue_histo[zio->io_priority]
+ vsx->vsx_queue_histo[priority]
[L_HISTO(zio->io_delta - zio->io_delay)]++;
vsx->vsx_disk_histo[type]
[L_HISTO(zio->io_delay)]++;
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 81cfd5cce..8c7468255 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
}
}
+static void
+top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ if (vd == vd->vdev_top) {
+ vdev_rebuild_stat_t vrs;
+ if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
+ sizeof (vrs) / sizeof (uint64_t));
+ }
+ }
+}
+
/*
* Generate the nvlist representing this vdev's config.
*/
@@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vdev_config_generate_stats(vd, nv);
root_vdev_actions_getprogress(vd, nv);
+ top_vdev_actions_getprogress(vd, nv);
/*
* Note: this can be called from open context
@@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_resilver_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
vd->vdev_resilver_txg);
+ if (vd->vdev_rebuild_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ vd->vdev_rebuild_txg);
if (vd->vdev_faulted)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 3edd65c01..094530e9b 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
- zio->io_abd, zio->io_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index e31271dcb..a8ef3d747 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1;
uint32_t zfs_vdev_initializing_max_active = 1;
uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 2;
+uint32_t zfs_vdev_rebuild_min_active = 1;
+uint32_t zfs_vdev_rebuild_max_active = 3;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_initializing_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
+ case ZIO_PRIORITY_REBUILD:
+ return (zfs_vdev_rebuild_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
+ case ZIO_PRIORITY_REBUILD:
+ return (zfs_vdev_rebuild_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio)
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
- zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
- zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
} else {
@@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
"Min active trim/discard I/Os per vdev");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+ "Max active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+ "Min active rebuild I/Os per vdev");
+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
/* END CSTYLED */
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
new file mode 100644
index 000000000..bf1079fd7
--- /dev/null
+++ b/module/zfs/vdev_rebuild.c
@@ -0,0 +1,1106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/vdev_impl.h>
+#include <sys/dsl_scan.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/zio.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+
+/*
+ * This file contains the sequential reconstruction implementation for
+ * resilvering. This form of resilvering is internally referred to as device
+ * rebuild to avoid conflating it with the traditional healing reconstruction
+ * performed by the dsl scan code.
+ *
+ * When replacing a device, or scrubbing the pool, ZFS has historically used
+ * a process called resilvering which is a form of healing reconstruction.
+ * This approach has the advantage that as blocks are read from disk their
+ * checksums can be immediately verified and the data repaired. Unfortunately,
+ * it also results in a random IO pattern to the disk even when extra care
+ * is taken to sequentialize the IO as much as possible. This substantially
+ * increases the time required to resilver the pool and restore redundancy.
+ *
+ * For mirrored devices it's possible to implement an alternate sequential
+ * reconstruction strategy when resilvering. Sequential reconstruction
+ * behaves like a traditional RAID rebuild and reconstructs a device in LBA
+ * order without verifying the checksum. After this phase completes a second
+ * scrub phase is started to verify all of the checksums. This two phase
+ * process will take longer than the healing reconstruction described above.
+ * However, it has the advantage that after the first phase of reconstruction
+ * completes, redundancy has been restored. At this point the pool can incur
+ * another device failure without risking data loss.
+ *
+ * There are a few noteworthy limitations and other advantages of resilvering
+ * using sequential reconstruction vs healing reconstruction.
+ *
+ * Limitations:
+ *
+ * - Only supported for mirror vdev types. Due to the variable stripe
+ * width used by raidz, sequential reconstruction is not possible.
+ *
+ * - Block checksums are not verified during sequential reconstruction.
+ * Similar to traditional RAID, the parity/mirror data is reconstructed
+ * but cannot be immediately double checked. For this reason, when the
+ * last active resilver completes the pool is automatically scrubbed.
+ *
+ * - Deferred resilvers using sequential reconstruction are not currently
+ * supported. When adding another vdev to an active top-level resilver,
+ * it must be restarted.
+ *
+ * Advantages:
+ *
+ * - Sequential reconstruction is performed in LBA order, which may be faster
+ * than healing reconstruction, particularly when using HDDs (or
+ * especially with SMR devices). Only allocated capacity is resilvered.
+ *
+ * - Sequential reconstruction is not constrained by ZFS block boundaries.
+ * This allows it to issue larger IOs to disk which span multiple blocks,
+ * allowing all of these logical blocks to be repaired with a single IO.
+ *
+ * - Unlike a healing resilver or scrub, which are pool-wide operations,
+ * sequential reconstruction is handled by the top-level mirror vdevs.
+ * This allows it to be started or canceled on a top-level vdev
+ * without impacting any other top-level vdevs in the pool.
+ *
+ * - Data only referenced by a pool checkpoint will be repaired because
+ * that space is reflected in the space maps. This differs from a
+ * healing resilver or scrub, which will not repair that data.
+ */
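
As a rough userland analogy of the two phases described above (purely illustrative; none of these names are ZFS APIs), the sketch below copies only the allocated extents of a healthy in-memory "disk" onto a replacement in LBA order without inspecting the data, then runs a checksum pass over the same extents, standing in for the scrub that follows a sequential resilver.

/* Toy analogue: sequential copy of allocated extents, then a verify pass. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DISK_SIZE 4096

struct extent { size_t start; size_t len; };	/* "allocated" LBA ranges */

static uint32_t
cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < len; i++)
		sum = sum * 31 + buf[i];
	return (sum);
}

int
main(void)
{
	uint8_t healthy[DISK_SIZE], replacement[DISK_SIZE] = { 0 };
	struct extent allocated[] = { { 0, 512 }, { 1024, 256 }, { 3072, 1024 } };
	size_t nextents = sizeof (allocated) / sizeof (allocated[0]);

	for (size_t i = 0; i < DISK_SIZE; i++)
		healthy[i] = (uint8_t)(i * 7);

	/* Phase 1: rebuild in LBA order, no checksum verification. */
	for (size_t i = 0; i < nextents; i++)
		memcpy(replacement + allocated[i].start,
		    healthy + allocated[i].start, allocated[i].len);

	/* Phase 2: the follow-up "scrub" verifies what was copied. */
	for (size_t i = 0; i < nextents; i++) {
		printf("extent %zu: %s\n", i,
		    cksum(healthy + allocated[i].start, allocated[i].len) ==
		    cksum(replacement + allocated[i].start, allocated[i].len) ?
		    "verified" : "MISMATCH");
	}
	return (0);
}

After phase 1 the replacement already mirrors every allocated extent, which corresponds to the point at which redundancy is restored in the real implementation.
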
+
+
+/*
+ * Maximum number of queued rebuild I/Os per top-level vdev. The number of
+ * concurrent rebuild I/Os issued to the device is controlled by the
+ * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
+ * options.
+ */
+unsigned int zfs_rebuild_queue_limit = 20;
+
+/*
+ * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
+ */
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+
+/*
+ * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
+ */
+static void vdev_rebuild_thread(void *arg);
+
+/*
+ * Clear the per-vdev rebuild bytes value for a vdev tree.
+ */
+static void
+clear_rebuild_bytes(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ clear_rebuild_bytes(vd->vdev_child[i]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_rebuild_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Determines whether a vdev_rebuild_thread() should be stopped.
+ */
+static boolean_t
+vdev_rebuild_should_stop(vdev_t *vd)
+{
+ return (!vdev_writeable(vd) || vd->vdev_removing ||
+ vd->vdev_rebuild_exit_wanted ||
+ vd->vdev_rebuild_cancel_wanted ||
+ vd->vdev_rebuild_reset_wanted);
+}
+
+/*
+ * Determine if the rebuild should be canceled. This may happen when all
+ * vdevs with MISSING DTLs are detached.
+ */
+static boolean_t
+vdev_rebuild_should_cancel(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * The sync task for updating the on-disk state of a rebuild. This is
+ * scheduled by vdev_rebuild_range().
+ */
+static void
+vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
+ vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
+ vr->vr_scan_offset[txg & TXG_MASK] = 0;
+ }
+
+ vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
+ NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Initialize the on-disk state for a new rebuild, start the rebuild thread.
+ */
+static void
+vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_rebuilding);
+
+ spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_start_time = gethrestime_sec();
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /*
+ * Rebuilds are currently only used when replacing a device, in which
+ * case there must be DTL_MISSING entries. In the future, we could
+ * allow rebuilds to be used in a way similar to a scrub. This would
+ * be useful because it would allow us to rebuild the space used by
+ * pool checkpoints.
+ */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu started",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name)
+{
+ nvlist_t *aux = fnvlist_alloc();
+
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential");
+ spa_event_notify(spa, vd, aux, name);
+ nvlist_free(aux);
+}
+
+/*
+ * Called to request that a new rebuild be started. The feature will remain
+ * active for the duration of the rebuild, then revert to the enabled state.
+ */
+static void
+vdev_rebuild_initiate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
+ ASSERT(!vd->vdev_rebuilding);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ vd->vdev_rebuilding = B_TRUE;
+
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
+}
+
+/*
+ * Update the on-disk state to completed when a rebuild finishes.
+ */
+static void
+vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu complete",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ /* Handles detaching of spares */
+ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ spa_notify_waiters(spa);
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Update the on-disk state to canceled when a rebuild finishes.
+ */
+static void
+vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu canceled",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ vd->vdev_rebuild_cancel_wanted = B_FALSE;
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ spa_notify_waiters(spa);
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Resets the progress of a running rebuild. This will occur when a new
+ * vdev is added to the rebuild.
+ */
+static void
+vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+ vrp->vrp_last_offset = 0;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_bytes_scanned = 0;
+ vrp->vrp_bytes_issued = 0;
+ vrp->vrp_bytes_rebuilt = 0;
+ vrp->vrp_bytes_est = 0;
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /* See vdev_rebuild_initiate_sync comment */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu reset",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ vd->vdev_rebuild_reset_wanted = B_FALSE;
+ ASSERT(vd->vdev_rebuilding);
+
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Clear the last rebuild status.
+ */
+void
+vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ objset_t *mos = spa_meta_objset(spa);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
+ vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return;
+ }
+
+ clear_rebuild_bytes(vd);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+
+ if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
+ VERIFY0(zap_update(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+ }
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * The zio_done_func_t callback for each rebuild I/O issued. It's responsible
+ * for updating the rebuild stats and limiting the number of in flight I/Os.
+ */
+static void
+vdev_rebuild_cb(zio_t *zio)
+{
+ vdev_rebuild_t *vr = zio->io_private;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vdev_t *vd = vr->vr_top_vdev;
+
+ mutex_enter(&vd->vdev_rebuild_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the top-level vdev was unavailable.
+ * Attempt to roll back to the last completed offset, in order
+ * to resume from the correct location if the pool is resumed.
+ * (This works because spa_sync waits on spa_txg_zio before
+ * it runs sync tasks.)
+ */
+ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else if (zio->io_error) {
+ vrp->vrp_errors++;
+ }
+
+ abd_free(zio->io_abd);
+
+ ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
+ vd->vdev_rebuild_inflight--;
+ cv_broadcast(&vd->vdev_rebuild_io_cv);
+ mutex_exit(&vd->vdev_rebuild_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Rebuild the data in this range by constructing a special dummy block
+ * pointer for the given range. It has no relation to any existing blocks
+ * in the pool, but by disabling checksum verification and issuing a scrub
+ * I/O, mirrored vdevs will replicate the block using any available mirror
+ * leaf vdevs.
+ */
+static void
+vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
+ uint64_t txg)
+{
+ vdev_t *vd = vr->vr_top_vdev;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t psize = asize;
+
+ ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+
+ blkptr_t blk, *bp = &blk;
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], start);
+ DVA_SET_GANG(&bp->blk_dva[0], 0);
+ DVA_SET_ASIZE(&bp->blk_dva[0], asize);
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(bp, psize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ /*
+ * We increment the issued bytes by the asize rather than the psize
+ * so the scanned and issued bytes may be directly compared. This
+ * is consistent with the scrub/resilver issued reporting.
+ */
+ vr->vr_pass_bytes_issued += asize;
+ vr->vr_rebuild_phys.vrp_bytes_issued += asize;
+
+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
+ abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+ ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_RESILVER, NULL));
+}
+
+/*
+ * Issues a rebuild I/O and takes care of rate limiting the number of queued
+ * rebuild I/Os. The provided start and size must be properly aligned for the
+ * top-level vdev type being rebuilt.
+ */
+static int
+vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
+{
+ uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
+ vdev_t *vd = vr->vr_top_vdev;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
+ ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
+
+ vr->vr_pass_bytes_scanned += size;
+ vr->vr_rebuild_phys.vrp_bytes_scanned += size;
+
+ mutex_enter(&vd->vdev_rebuild_io_lock);
+
+ /* Limit in flight rebuild I/Os */
+ while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
+ cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+
+ vd->vdev_rebuild_inflight++;
+ mutex_exit(&vd->vdev_rebuild_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ /* This is the first I/O for this txg. */
+ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
+ vr->vr_scan_offset[txg & TXG_MASK] = start;
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_rebuild_update_sync,
+ (void *)(uintptr_t)vd->vdev_id, 2,
+ ZFS_SPACE_CHECK_RESERVED, tx);
+ }
+
+ /* When exiting, write out our progress. */
+ if (vdev_rebuild_should_stop(vd)) {
+ mutex_enter(&vd->vdev_rebuild_io_lock);
+ vd->vdev_rebuild_inflight--;
+ mutex_exit(&vd->vdev_rebuild_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ vr->vr_scan_offset[txg & TXG_MASK] = start + size;
+ vdev_rebuild_rebuild_block(vr, start, size, txg);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
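
The cv_wait() loop above is a conventional counted in-flight limiter. A self-contained POSIX-threads analogue is shown here (illustrative only; QUEUE_LIMIT stands in for zfs_rebuild_queue_limit and pthreads replace the kernel mutex/condvar primitives).

/* Userland sketch of bounding in-flight work with a mutex and condvar. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define QUEUE_LIMIT	20	/* analogue of zfs_rebuild_queue_limit */

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static unsigned int inflight;

static void *
io_done(void *arg)	/* analogue of the rebuild I/O completion callback */
{
	usleep(1000);			/* pretend the I/O took some time */
	pthread_mutex_lock(&io_lock);
	inflight--;
	pthread_cond_broadcast(&io_cv);
	pthread_mutex_unlock(&io_lock);
	return (arg);
}

int
main(void)
{
	pthread_t tid[100];

	for (int i = 0; i < 100; i++) {
		pthread_mutex_lock(&io_lock);
		while (inflight >= QUEUE_LIMIT)	/* block the issuing thread */
			pthread_cond_wait(&io_cv, &io_lock);
		inflight++;
		pthread_mutex_unlock(&io_lock);
		pthread_create(&tid[i], NULL, io_done, NULL);
	}
	for (int i = 0; i < 100; i++)
		pthread_join(tid[i], NULL);
	printf("all I/Os completed, inflight=%u\n", inflight);
	return (0);
}

The same structure keeps the issuing thread from racing ahead of the device: it only sleeps when the outstanding count reaches the limit and is woken as completions drain the queue.
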
+
+/*
+ * Split range into legally-sized logical chunks given the constraints of the
+ * top-level mirror vdev type.
+ */
+static uint64_t
+vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
+{
+ uint64_t chunk_size, max_asize, max_segment;
+
+ ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+
+ max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
+ 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
+ max_asize = vdev_psize_to_asize(vd, max_segment);
+ chunk_size = MIN(size, max_asize);
+
+ return (chunk_size);
+}
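
A standalone sketch of the sizing above, assuming a plain mirror where vdev_psize_to_asize() is an identity mapping; P2ROUNDUP, SPA_MAXBLOCKSIZE and MIN are defined locally for the example rather than taken from the ZFS headers.

/* Standalone sketch of how a remaining range is carved into rebuild chunks. */
#include <stdio.h>
#include <stdint.h>

#define P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
#define SPA_MAXBLOCKSIZE	(16ULL << 20)
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t max_segment = 1024 * 1024;		/* zfs_rebuild_max_segment */
	uint64_t ashift = 12;				/* 4 KiB sectors */
	uint64_t start = 0, size = (5ULL << 20) + 4096;	/* 5 MiB + one sector */

	/* Align the tunable to the vdev's sector size and cap it. */
	uint64_t max_asize = MIN(P2ROUNDUP(max_segment, 1ULL << ashift),
	    SPA_MAXBLOCKSIZE);		/* psize == asize assumed for a mirror */

	while (size > 0) {
		uint64_t chunk = MIN(size, max_asize);
		printf("rebuild I/O at %10llu, length %7llu\n",
		    (unsigned long long)start, (unsigned long long)chunk);
		start += chunk;
		size -= chunk;
	}
	return (0);
}

With these inputs the range is issued as five 1 MiB chunks followed by a single trailing 4 KiB chunk.
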
+
+/*
+ * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
+ */
+static int
+vdev_rebuild_ranges(vdev_rebuild_t *vr)
+{
+ vdev_t *vd = vr->vr_top_vdev;
+ zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
+ zfs_btree_index_t idx;
+ int error;
+
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t start = rs_get_start(rs, vr->vr_scan_tree);
+ uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start;
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable rebuild
+ * progress for testing. See comment in dsl_scan_sync().
+ */
+ while (zfs_scan_suspend_progress &&
+ !vdev_rebuild_should_stop(vd)) {
+ delay(hz);
+ }
+
+ while (size > 0) {
+ uint64_t chunk_size;
+
+ chunk_size = vdev_rebuild_chunk_size(vd, start, size);
+
+ error = vdev_rebuild_range(vr, start, chunk_size);
+ if (error != 0)
+ return (error);
+
+ size -= chunk_size;
+ start += chunk_size;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Calculates the estimated capacity which remains to be scanned. Since
+ * we traverse the pool in metaslab order, only allocated capacity beyond
+ * the vrp_last_offset need be considered. All lower offsets must have
+ * already been rebuilt and are thus already included in vrp_bytes_scanned.
+ */
+static void
+vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t bytes_est = vrp->vrp_bytes_scanned;
+
+ if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start)
+ return;
+
+ for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+
+ mutex_enter(&msp->ms_lock);
+ bytes_est += metaslab_allocated_space(msp);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ vrp->vrp_bytes_est = bytes_est;
+}
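
A simplified model of the computation above (hypothetical numbers; metaslab_allocated_space() is replaced with a plain array): bytes already counted in vrp_bytes_scanned cover everything below the current metaslab, while the allocated space of the current and all later metaslabs still has to be scanned.

/* Simplified model of the remaining-capacity estimate. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Hypothetical allocated bytes for each metaslab, in MiB. */
	uint64_t allocated[] = { 900, 0, 450, 300, 120 };
	uint64_t ms_count = sizeof (allocated) / sizeof (allocated[0]);
	uint64_t ms_id = 2;		/* metaslab currently being rebuilt */
	uint64_t bytes_scanned = 950;	/* vrp_bytes_scanned so far, in MiB */

	uint64_t bytes_est = bytes_scanned;
	for (uint64_t i = ms_id; i < ms_count; i++)
		bytes_est += allocated[i];

	/* 950 + 450 + 300 + 120 = 1820 MiB estimated in total. */
	printf("estimated total to scan: %llu MiB\n",
	    (unsigned long long)bytes_est);
	return (0);
}
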
+
+/*
+ * Load from disk the top-level vdev's rebuild information.
+ */
+int
+vdev_rebuild_load(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vd->vdev_rebuilding = B_FALSE;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(vd->vdev_top == vd);
+
+ err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp);
+
+ /*
+ * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
+ * not prevent a pool from being imported. Clear the rebuild
+ * status allowing a new resilver/rebuild to be started.
+ */
+ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ } else if (err) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (err);
+ }
+
+ vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
+ vr->vr_top_vdev = vd;
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ return (0);
+}
+
+/*
+ * Each scan thread is responsible for rebuilding a top-level vdev. The
+ * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ */
+static void
+vdev_rebuild_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ /*
+ * If there's a scrub in progress, request that it be stopped. This
+ * is not required for a correct rebuild, but we do want rebuilds to
+ * emulate the resilver behavior as much as possible.
+ */
+ dsl_pool_t *dsl = spa_get_dsl(spa);
+ if (dsl_scan_scrubbing(dsl))
+ dsl_scan_cancel(dsl);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
+ ASSERT(vd->vdev_rebuilding);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
+ ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
+ ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
+
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vr->vr_top_vdev = vd;
+ vr->vr_scan_msp = NULL;
+ vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ vr->vr_pass_start_time = gethrtime();
+ vr->vr_pass_bytes_scanned = 0;
+ vr->vr_pass_bytes_issued = 0;
+
+ uint64_t update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, 0);
+
+ clear_rebuild_bytes(vr->vr_top_vdev);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ /*
+ * Systematically walk the metaslabs and issue rebuild I/Os for
+ * all ranges in the allocated space map.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+ vr->vr_scan_msp = msp;
+
+ /*
+ * Removal of vdevs from the vdev tree may eliminate the need
+ * for the rebuild, in which case it should be canceled. The
+ * vdev_rebuild_cancel_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (vdev_rebuild_should_cancel(vd)) {
+ vd->vdev_rebuild_cancel_wanted = B_TRUE;
+ error = EINTR;
+ break;
+ }
+
+ ASSERT0(range_tree_space(vr->vr_scan_tree));
+
+ /*
+ * Disable any new allocations to this metaslab and wait
+ * for any writes inflight to complete. This is needed to
+ * ensure all allocated ranges are rebuilt.
+ */
+ metaslab_disable(msp);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ txg_wait_synced(dsl, 0);
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * When a metaslab has been allocated from, read its allocated
+ * ranges from the space map object into the vr_scan_tree.
+ * Then add inflight / unflushed ranges and remove inflight /
+ * unflushed frees. This is the minimum range to be rebuilt.
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ vr->vr_scan_tree, SM_ALLOC));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(
+ msp->ms_allocating[i]));
+ }
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, vr->vr_scan_tree);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, vr->vr_scan_tree);
+
+ /*
+ * Remove ranges which have already been rebuilt based
+ * on the last offset. This can happen when restarting
+ * a scan after exporting and re-importing the pool.
+ */
+ range_tree_clear(vr->vr_scan_tree, 0,
+ vrp->vrp_last_offset);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ /*
+ * To provide an accurate estimate, re-calculate the estimated
+ * size every 5 minutes to account for recent allocations and
+ * frees made to space maps which have not yet been rebuilt.
+ */
+ if (gethrtime() > update_est_time + SEC2NSEC(300)) {
+ update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, i);
+ }
+
+ /*
+ * Walk the allocated space map and issue the rebuild I/O.
+ */
+ error = vdev_rebuild_ranges(vr);
+ range_tree_vacate(vr->vr_scan_tree, NULL, NULL);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+
+ if (error != 0)
+ break;
+ }
+
+ range_tree_destroy(vr->vr_scan_tree);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /* Wait for any remaining rebuild I/O to complete */
+ mutex_enter(&vd->vdev_rebuild_io_lock);
+ while (vd->vdev_rebuild_inflight > 0)
+ cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+
+ mutex_exit(&vd->vdev_rebuild_io_lock);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (error == 0) {
+ /*
+ * After a successful rebuild clear the DTLs of all ranges
+ * which were missing when the rebuild was started. These
+ * ranges must have been rebuilt as a consequence of rebuilding
+ * all allocated space. Note that, unlike a scrub or resilver,
+ * the rebuild operation will also reconstruct data which is only
+ * referenced by a pool checkpoint. See the dsl_scan_done() comments.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
+ (void *)(uintptr_t)vd->vdev_id, 0,
+ ZFS_SPACE_CHECK_NONE, tx);
+ } else if (vd->vdev_rebuild_cancel_wanted) {
+ /*
+ * The rebuild operation was canceled. This will occur when
+ * a device participating in the rebuild is detached.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
+ (void *)(uintptr_t)vd->vdev_id, 0,
+ ZFS_SPACE_CHECK_NONE, tx);
+ } else if (vd->vdev_rebuild_reset_wanted) {
+ /*
+ * Reset the running rebuild without canceling and restarting
+ * it. This will occur when a new device is attached and must
+ * participate in the rebuild.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
+ (void *)(uintptr_t)vd->vdev_id, 0,
+ ZFS_SPACE_CHECK_NONE, tx);
+ } else {
+ /*
+ * The rebuild operation should be suspended. This may occur
+ * when detaching a child vdev or when exporting the pool. The
+ * rebuild is left in the active state so it will be resumed.
+ */
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ vd->vdev_rebuilding = B_FALSE;
+ }
+
+ dmu_tx_commit(tx);
+
+ vd->vdev_rebuild_thread = NULL;
+ mutex_exit(&vd->vdev_rebuild_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Returns B_TRUE if any top-level vdevs are rebuilding.
+ */
+boolean_t
+vdev_rebuild_active(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ boolean_t ret = B_FALSE;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ ret = vdev_rebuild_active(vd->vdev_child[i]);
+ if (ret)
+ return (ret);
+ }
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * Start a rebuild operation. If a rebuild is already in progress on
+ * the top-level vdev it is restarted from the beginning.
+ */
+void
+vdev_rebuild(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(!vd->vdev_removing);
+ ASSERT(spa_feature_is_enabled(vd->vdev_spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuilding) {
+ ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);
+
+ /*
+ * Signal a running rebuild operation that it should restart
+ * from the beginning because a new device was attached. The
+ * vdev_rebuild_reset_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (!vd->vdev_rebuild_reset_wanted)
+ vd->vdev_rebuild_reset_wanted = B_TRUE;
+ } else {
+ vdev_rebuild_initiate(vd);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_restart_impl(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_restart_impl(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
+ vdev_writeable(vd) && !vd->vdev_rebuilding) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+ vd->vdev_rebuilding = B_TRUE;
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
+ maxclsyspri);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Conditionally restart all of the rebuild threads for a pool. The
+ * feature flag must be active and the rebuild must be in the active
+ * state. This cannot be used to start a new rebuild.
+ */
+void
+vdev_rebuild_restart(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ vdev_rebuild_restart_impl(spa->spa_root_vdev);
+}
+
+/*
+ * Stop and wait for all of the rebuild threads associated with the
+ * provided vdev tree to be terminated (canceled or stopped).
+ */
+void
+vdev_rebuild_stop_wait(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_stop_wait(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuild_thread != NULL) {
+ vd->vdev_rebuild_exit_wanted = B_TRUE;
+ while (vd->vdev_rebuilding) {
+ cv_wait(&vd->vdev_rebuild_cv,
+ &vd->vdev_rebuild_lock);
+ }
+ vd->vdev_rebuild_exit_wanted = B_FALSE;
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Stop all rebuild operations but leave them in the active state so they
+ * will be resumed when importing the pool.
+ */
+void
+vdev_rebuild_stop_all(spa_t *spa)
+{
+ vdev_rebuild_stop_wait(spa->spa_root_vdev);
+}
+
+/*
+ * Rebuild statistics reported per top-level vdev.
+ */
+int
+vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
+{
+ spa_t *spa = tvd->vdev_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (SET_ERROR(ENOTSUP));
+
+ if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
+ return (SET_ERROR(EINVAL));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
+
+ if (error == ENOENT) {
+ bzero(vrs, sizeof (vdev_rebuild_stat_t));
+ vrs->vrs_state = VDEV_REBUILD_NONE;
+ error = 0;
+ } else if (error == 0) {
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&tvd->vdev_rebuild_lock);
+ vrs->vrs_state = vrp->vrp_rebuild_state;
+ vrs->vrs_start_time = vrp->vrp_start_time;
+ vrs->vrs_end_time = vrp->vrp_end_time;
+ vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
+ vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
+ vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
+ vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
+ vrs->vrs_bytes_est = vrp->vrp_bytes_est;
+ vrs->vrs_errors = vrp->vrp_errors;
+ vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
+ vr->vr_pass_start_time);
+ vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
+ vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+ mutex_exit(&tvd->vdev_rebuild_lock);
+ }
+
+ return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+ "Max segment size in bytes of rebuild reads");
+/* END CSTYLED */
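A minimal ksh sketch (illustrative only, not part of the change) of how the sequential rebuild implemented above is driven from userspace. The pool and device names are hypothetical, and the tunable path assumes the usual Linux /sys/module/zfs/parameters location:

    # Optionally cap the size of individual rebuild reads (in bytes).
    echo 1048576 > /sys/module/zfs/parameters/zfs_rebuild_max_segment

    # Request a sequential (rebuild) resilver rather than a healing one.
    zpool attach -s tank sda sdb      # mirror an existing disk and rebuild it
    zpool replace -s tank sdb sdc     # or replace a device outright

    # Monitor progress and block until the rebuild completes.
    zpool status tank
    zpool wait -t resilver tank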
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 4122114b5..1d2ae6270 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1938,8 +1938,9 @@ static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
- int replacing = zc->zc_cookie;
nvlist_t *config;
+ int replacing = zc->zc_cookie;
+ int rebuild = zc->zc_simple;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
@@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) == 0) {
- error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
+ rebuild);
nvlist_free(config);
}
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 765ffea8a..f6478dd0d 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -487,7 +487,8 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing',
tags = ['functional', 'cli_root', 'zpool_wait']
[tests/functional/cli_root/zpool_wait/scan]
-tests = ['zpool_wait_replace_cancel', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
+tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
+ 'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
tags = ['functional', 'cli_root', 'zpool_wait']
@@ -748,7 +749,11 @@ tests = ['rename_dirs_001_pos']
tags = ['functional', 'rename_dirs']
[tests/functional/replacement]
-tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos']
+tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
+ 'attach_resilver', 'detach', 'rebuild_disabled_feature',
+ 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
+ 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
+ 'scrub_cancel']
tags = ['functional', 'replacement']
[tests/functional/reservation]
@@ -762,10 +767,6 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
'reservation_022_pos']
tags = ['functional', 'reservation']
-[tests/functional/resilver]
-tests = ['resilver_restart_001', 'resilver_restart_002']
-tags = ['functional', 'resilver']
-
[tests/functional/rootpool]
tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
tags = ['functional', 'rootpool']
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 9fbcc37c6..5e07cda4d 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -2222,26 +2222,27 @@ function check_pool_status # pool token keyword <verbose>
if [[ $verbose == true ]]; then
log_note $scan
fi
- echo $scan | grep -i "$keyword" > /dev/null 2>&1
+ echo $scan | egrep -i "$keyword" > /dev/null 2>&1
return $?
}
#
# The following functions are instance of check_pool_status()
-# is_pool_resilvering - to check if the pool is resilver in progress
-# is_pool_resilvered - to check if the pool is resilver completed
-# is_pool_scrubbing - to check if the pool is scrub in progress
-# is_pool_scrubbed - to check if the pool is scrub completed
-# is_pool_scrub_stopped - to check if the pool is scrub stopped
-# is_pool_scrub_paused - to check if the pool has scrub paused
-# is_pool_removing - to check if the pool is removing a vdev
-# is_pool_removed - to check if the pool is remove completed
-# is_pool_discarding - to check if the pool has checkpoint being discarded
+# is_pool_resilvering - to check if the pool resilver is in progress
+# is_pool_resilvered - to check if the pool resilver is completed
+# is_pool_scrubbing - to check if the pool scrub is in progress
+# is_pool_scrubbed - to check if the pool scrub is completed
+# is_pool_scrub_stopped - to check if the pool scrub is stopped
+# is_pool_scrub_paused - to check if the pool scrub has paused
+# is_pool_removing - to check if the pool is removing a vdev
+# is_pool_removed - to check if the pool remove is completed
+# is_pool_discarding - to check if the pool checkpoint is being discarded
#
function is_pool_resilvering #pool <verbose>
{
- check_pool_status "$1" "scan" "resilver in progress since " $2
+ check_pool_status "$1" "scan" \
+ "resilver[ ()0-9A-Za-z_-]* in progress since" $2
return $?
}
@@ -3487,7 +3488,7 @@ function wait_scrubbed
typeset pool=${1:-$TESTPOOL}
while true ; do
is_pool_scrubbed $pool && break
- log_must sleep 1
+ sleep 1
done
}
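As a brief sketch of how the updated helpers are intended to be used (the broader regex above is meant to also match the sequential-rebuild status line), a test can assert the scan is running and then wait for it; $TESTPOOL is assumed to be provided by the test framework:

    # Verify a resilver/rebuild is in progress, wait for it, then confirm.
    log_must is_pool_resilvering $TESTPOOL
    log_must zpool wait -t resilver $TESTPOOL
    log_must is_pool_resilvered $TESTPOOL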
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index 24f3e50bb..c56518c55 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -65,7 +65,6 @@ SUBDIRS = \
rename_dirs \
replacement \
reservation \
- resilver \
rootpool \
rsend \
scrub_mirror \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index ee5b2b4e1..4991b76bf 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -79,6 +79,7 @@ typeset -a properties=(
"feature@redacted_datasets"
"feature@bookmark_written"
"feature@log_spacemap"
+ "feature@device_rebuild"
)
if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am
index 6a21cac4f..451d83a79 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am
@@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \
cleanup.ksh \
zpool_wait_replace.ksh \
zpool_wait_replace_cancel.ksh \
+ zpool_wait_rebuild.ksh \
zpool_wait_resilver.ksh \
zpool_wait_scrub_basic.ksh \
zpool_wait_scrub_cancel.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh
new file mode 100755
index 000000000..8cd586459
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib
+
+#
+# DESCRIPTION:
+# 'zpool wait' works when waiting for sequential resilvering to complete.
+#
+# STRATEGY:
+# 1. Attach a device to the pool so that sequential resilvering starts.
+# 2. Start 'zpool wait'.
+# 3. Monitor the waiting process to make sure it returns neither too soon nor
+# too late.
+# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using
+# 'zpool wait'.
+#
+
+function cleanup
+{
+ remove_io_delay
+ kill_if_running $pid
+ get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \
+ log_must zpool detach $TESTPOOL $DISK2
+}
+
+typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL"
+typeset pid
+
+log_onexit cleanup
+
+add_io_delay $TESTPOOL
+
+# Test 'zpool wait -t resilver'
+log_must zpool attach -s $TESTPOOL $DISK1 $DISK2
+log_bkgrnd zpool wait -t resilver $TESTPOOL
+pid=$!
+check_while_waiting $pid "$IN_PROGRESS_CHECK"
+
+log_must zpool detach $TESTPOOL $DISK2
+
+# Test 'zpool attach -w'
+log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2
+pid=$!
+while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do
+ log_must sleep .5
+done
+check_while_waiting $pid "$IN_PROGRESS_CHECK"
+
+log_pass "'zpool wait -t resilver' and 'zpool attach -w' work."
diff --git a/tests/zfs-tests/tests/functional/replacement/Makefile.am b/tests/zfs-tests/tests/functional/replacement/Makefile.am
index d47fcd5e1..fe6e49121 100644
--- a/tests/zfs-tests/tests/functional/replacement/Makefile.am
+++ b/tests/zfs-tests/tests/functional/replacement/Makefile.am
@@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
- replacement_001_pos.ksh \
- replacement_002_pos.ksh \
- replacement_003_pos.ksh
+ attach_import.ksh \
+ attach_multiple.ksh \
+ attach_rebuild.ksh \
+ attach_resilver.ksh \
+ detach.ksh \
+ rebuild_disabled_feature.ksh \
+ rebuild_multiple.ksh \
+ rebuild_raidz.ksh \
+ replace_import.ksh \
+ replace_rebuild.ksh \
+ replace_resilver.ksh \
+ resilver_restart_001.ksh \
+ resilver_restart_002.ksh \
+ scrub_cancel.ksh
dist_pkgdata_DATA = \
replacement.cfg
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_import.ksh b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh
new file mode 100755
index 000000000..e2749b164
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh
@@ -0,0 +1,67 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that on import an in progress attach operation is resumed.
+#
+# Strategy:
+# 1. For both healing and sequential resilvering.
+# a. Create a pool
+# b. Add a vdev with 'zpool attach' and resilver (-s) it.
+# c. Export the pool
+# d. Import the pool
+# e. Verify the 'zpool attach' resumed resilvering
+# f. Destroy the pool
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]}
+}
+
+log_assert "Verify attach is resumed on import"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}
+
+# Verify healing and sequential resilver resume on import.
+for arg in "" "-s"; do
+ log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+ log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+ log_must is_pool_resilvering $TESTPOOL1
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
+ log_must is_pool_resilvering $TESTPOOL1
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+ log_must zpool wait -t resilver $TESTPOOL1
+ log_must is_pool_resilvered $TESTPOOL1
+ destroy_pool $TESTPOOL1
+done
+
+log_pass "Verify attach is resumed on import"
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh
new file mode 100755
index 000000000..b3192b2bf
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh
@@ -0,0 +1,111 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that attach/detach work while resilvering and attaching
+# multiple vdevs.
+#
+# Strategy:
+# 1. Create a single vdev pool
+# 2. While healing or sequential resilvering:
+# a. Attach a vdev to convert the pool to a mirror.
+# b. Attach a vdev to convert the pool to a 3-way mirror.
+# c. Verify the original vdev cannot be removed (no redundant copies)
+# d. Detach a vdev. Healing and sequential resilver remain running.
+# e. Detach a vdev. Healing resilver remains running, sequential
+# resilver is canceled.
+# f. Wait for resilver to complete.
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]}
+}
+
+log_assert "Verify attach/detech with multiple vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}
+
+# Create a single vdev (stripe) pool for the attach/detach tests below.
+log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
+
+for replace_mode in "healing" "sequential"; do
+ #
+ # Resilvers abort the dsl_scan and reconfigure it for resilvering.
+ # Rebuilds cancel the dsl_scan and start the vdev_rebuild thread.
+ #
+ if [[ "$replace_mode" = "healing" ]]; then
+ flags=""
+ else
+ flags="-s"
+ fi
+
+ log_mustnot is_pool_resilvering $TESTPOOL1
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+ # Attach first vdev (stripe -> mirror)
+ log_must zpool attach $flags $TESTPOOL1 \
+ ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+ log_must is_pool_resilvering $TESTPOOL1
+
+ # Attach second vdev (2-way -> 3-way mirror)
+ log_must zpool attach $flags $TESTPOOL1 \
+ ${VDEV_FILES[1]} ${VDEV_FILES[2]}
+ log_must is_pool_resilvering $TESTPOOL1
+
+ # Original vdev cannot be detached until there is sufficient redundancy.
+ log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]}
+
+ # Detach first vdev (resilver keeps running)
+ log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]}
+ log_must is_pool_resilvering $TESTPOOL1
+
+ #
+ # Detach second vdev. There's a difference in behavior between
+ # healing and sequential resilvers. A healing resilver will not be
+ # canceled even though there's nothing on the original vdev which
+ # needs to be rebuilt. A sequential resilver, on the other hand, is
+ # canceled when returning to a non-redundant striped layout. At
+ # some point the healing resilver behavior should be updated to match
+ # the sequential resilver behavior.
+ #
+ log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]}
+
+ if [[ "$replace_mode" = "healing" ]]; then
+ log_must is_pool_resilvering $TESTPOOL1
+ else
+ log_mustnot is_pool_resilvering $TESTPOOL1
+ fi
+
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ log_must zpool wait $TESTPOOL1
+done
+
+log_pass "Verify attach/detech with multiple vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh
new file mode 100755
index 000000000..e9427c7ad
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh
@@ -0,0 +1,173 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Attaching disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+# 1. Create multidisk pools (stripe/mirror/raidz) and
+# start some random I/O
+# 2. Attach a disk to the pool.
+# 3. Verify the integrity of the file system and the resilvering.
+#
+# NOTE: Raidz does not support the sequential resilver (-s) option.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ if [[ -n "$child_pids" ]]; then
+ for wait_pid in $child_pids; do
+ kill $wait_pid
+ done
+ fi
+
+ if poolexists $TESTPOOL1; then
+ destroy_pool $TESTPOOL1
+ fi
+
+ [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function attach_test
+{
+ typeset -i iters=2
+ typeset -i index=0
+ typeset opt=$1
+ typeset disk1=$2
+ typeset disk2=$3
+
+ typeset i=0
+ while [[ $i -lt $iters ]]; do
+ log_note "Invoking file_trunc with: $options_display"
+ file_trunc $options $TESTDIR/$TESTFILE.$i &
+ typeset pid=$!
+
+ sleep 1
+
+ child_pids="$child_pids $pid"
+ ((i = i + 1))
+ done
+
+ log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2
+
+ for wait_pid in $child_pids; do
+ kill $wait_pid
+ done
+ child_pids=""
+
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $TESTDIR $TESTPOOL1
+ log_must zfs umount $TESTPOOL1/$TESTFS1
+ log_must zdb -cdui $TESTPOOL1/$TESTFS1
+ log_must zfs mount $TESTPOOL1/$TESTFS1
+ verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+ truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+ specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+ ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for op in "" "-f"; do
+ create_pool $TESTPOOL1 mirror $specials_list
+ log_must zfs create $TESTPOOL1/$TESTFS1
+ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+ attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+ if [[ $? -ne 0 ]]; then
+ log_fail "$REPLACEFILE is not present."
+ fi
+
+ destroy_pool $TESTPOOL1
+done
+
+log_note "Verify 'zpool attach' fails with non-mirrors."
+
+for type in "" "raidz" "raidz1"; do
+ for op in "" "-f"; do
+ create_pool $TESTPOOL1 $type $specials_list
+ log_must zfs create $TESTPOOL1/$TESTFS1
+ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+ log_mustnot zpool attach -s "$opt" $TESTDIR/$TESTFILE1.1 \
+ $TESTDIR/$REPLACEFILE
+
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+ if [[ $? -eq 0 ]]; then
+ log_fail "$REPLACEFILE should not be present."
+ fi
+
+ destroy_pool $TESTPOOL1
+ done
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh
index 391aa5cf0..4261d4d67 100755
--- a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh
@@ -104,9 +104,7 @@ function attach_test
((i = i + 1))
done
- log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2
-
- sleep 10
+ log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2
for wait_pid in $child_pids
do
@@ -119,13 +117,13 @@ function attach_test
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
-
+ verify_pool $TESTPOOL1
}
specials_list=""
i=0
-while [[ $i != 2 ]]; do
- mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+while [[ $i != 3 ]]; do
+ truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
@@ -134,7 +132,7 @@ done
#
# Create a replacement disk special file.
#
-mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
for op in "" "-f"; do
create_pool $TESTPOOL1 mirror $specials_list
@@ -143,7 +141,7 @@ for op in "" "-f"; do
attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
- zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -ne 0 ]]; then
log_fail "$REPLACEFILE is not present."
fi
@@ -162,7 +160,7 @@ for type in "" "raidz" "raidz1"; do
log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \
$TESTDIR/$REPLACEFILE
- zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -eq 0 ]]; then
log_fail "$REPLACEFILE should not be present."
fi
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh
index 71b9602ee..aa3ec4f7a 100755
--- a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/replacement/detach.ksh
@@ -121,8 +121,8 @@ function detach_test
specials_list=""
i=0
-while [[ $i != 2 ]]; do
- mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+while [[ $i != 3 ]]; do
+ truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
@@ -134,7 +134,7 @@ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
detach_test $TESTDIR/$TESTFILE1.1
-zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
+zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
if [[ $? -eq 0 ]]; then
log_fail "$TESTFILE1.1 should no longer be present."
fi
@@ -143,14 +143,14 @@ destroy_pool $TESTPOOL1
log_note "Verify 'zpool detach' fails with non-mirrors."
-for type in "" "raidz" "raidz1" ; do
+for type in "" "raidz" "raidz1"; do
create_pool $TESTPOOL1 $type $specials_list
log_must zfs create $TESTPOOL1/$TESTFS1
log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
log_mustnot zpool detach $TESTDIR/$TESTFILE1.1
- zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
+ zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
if [[ $? -ne 0 ]]; then
log_fail "$TESTFILE1.1 is not present."
fi
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh
new file mode 100755
index 000000000..d17d83b78
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify device_rebuild feature flags.
+#
+# Strategy:
+# 1. Create a pool with all features disabled.
+# 2. Verify 'zpool replace -s' fails and the feature is disabled.
+# 3. Enable the device_rebuild feature.
+# 4. Verify 'zpool replace -s' works and the feature is active.
+# 5. Wait for the feature to return to enabled.
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+function check_feature_flag
+{
+ feature=$1
+ pool=$2
+ expected_value=$3
+
+ value="$(zpool get -H -o property,value all $pool | \
+ egrep "$feature" | awk '{print $2}')"
+ if [ "$value" = "$expected_value" ]; then
+ log_note "$feature verified to be $value"
+ else
+ log_fail "$feature should be $expected_value but is $value"
+ fi
+}
+
+log_assert "Verify device_rebuild feature flags."
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]}
+
+log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled"
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+log_must zpool set feature@device_rebuild=enabled $TESTPOOL1
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active"
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled"
+
+log_pass "Verify device_rebuild feature flags."
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh
new file mode 100755
index 000000000..7775cbff4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh
@@ -0,0 +1,126 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Sequential reconstruction (unlike healing reconstruction) operate on the
+# top-level vdev. This means that a sequential resilver operation can be
+# started/stopped on a different top-level vdev without impacting other
+# sequential resilvers.
+#
+# STRATEGY:
+# 1. Create a pool with two mirror vdevs.
+# 2. Start sequential resilvers on both top-level vdevs and verify via
+# 'zpool history -i' that they run and complete independently.
+# 3. Repeat, canceling one resilver with 'zpool detach' and verifying the
+# other still completes.
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2
+}
+
+function check_history
+{
+ pool=$1
+ msg=$2
+ exp=$3
+
+ count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg")
+ if [[ "$count" -ne "$exp" ]]; then
+ log_fail "Expected $exp rebuild '$msg' messages, found $count"
+ else
+ log_note "Found $count/$exp rebuild '$msg' messages"
+ fi
+}
+
+log_assert "Rebuilds operate on the top-level vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \
+ $SPARE_VDEV_FILE $SPARE_VDEV_FILE2
+
+# Verify two sequential resilvers can run concurrently.
+log_must zpool create -f $TESTPOOL1 \
+ mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
+ mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
+log_must zpool sync $TESTPOOL1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "started" 2
+check_history $TESTPOOL1 "reset" 0
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 0
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+
+check_history $TESTPOOL1 "complete" 2
+destroy_pool $TESTPOOL1
+
+# Verify canceling one resilver (zpool detach) does not impact others.
+log_must zpool create -f $TESTPOOL1 \
+ mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
+ mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
+log_must zpool sync $TESTPOOL1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "started" 2
+check_history $TESTPOOL1 "reset" 0
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 0
+
+log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+
+check_history $TESTPOOL1 "complete" 1
+check_history $TESTPOOL1 "canceled" 1
+destroy_pool $TESTPOOL1
+
+log_pass "Rebuilds operate on the top-level vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh
new file mode 100755
index 000000000..c919b44b2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh
@@ -0,0 +1,70 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Executing 'zpool replace -s' for raidz vdevs fails. Sequential
+# resilvers are only allowed for stripe/mirror pools.
+#
+# STRATEGY:
+# 1. Create a raidz pool, verify 'zpool replace -s' fails
+# 2. Create a stripe/mirror pool, verify 'zpool replace -s' passes
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Sequential resilver is not allowed for raidz vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+# raidz[1-3]
+for vdev_type in "raidz" "raidz2" "raidz3"; do
+ log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]}
+ log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \
+ $SPARE_VDEV_FILE
+ destroy_pool $TESTPOOL1
+done
+
+# stripe
+log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]}
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+destroy_pool $TESTPOOL1
+
+# mirror
+log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+destroy_pool $TESTPOOL1
+
+log_pass "Sequential resilver is not allowed for raidz vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_import.ksh b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh
new file mode 100755
index 000000000..35d51d939
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh
@@ -0,0 +1,67 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that on import an in progress replace operation is resumed.
+#
+# Strategy:
+# 1. For both healing and sequential resilvering replace:
+# a. Create a pool
+# b. Replace a vdev with 'zpool replace' and resilver (-s) it.
+# c. Export the pool
+# d. Import the pool
+# e. Verify the 'zpool replace' resumed resilvering.
+# f. Destroy the pool
+#
+
+function cleanup
+{
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Verify replace is resumed on import"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+# Verify healing and sequential resilver resume on import.
+for arg in "" "-s"; do
+ log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+ log_must zpool replace $arg $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE
+ log_must is_pool_resilvering $TESTPOOL1
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
+ log_must is_pool_resilvering $TESTPOOL1
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+ log_must zpool wait -t resilver $TESTPOOL1
+ log_must is_pool_resilvered $TESTPOOL1
+ destroy_pool $TESTPOOL1
+done
+
+log_pass "Verify replace is resumed on import"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh
new file mode 100755
index 000000000..599735228
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh
@@ -0,0 +1,158 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Replacing disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+# 1. Create multidisk pools (stripe/mirror) and
+# start some random I/O
+# 2. Replace a disk in the pool with another disk.
+# 3. Verify the integrity of the file system and the rebuilding.
+#
+# NOTE: Raidz does not support the sequential resilver (-s) option.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ if [[ -n "$child_pids" ]]; then
+ for wait_pid in $child_pids
+ do
+ kill $wait_pid
+ done
+ fi
+
+ if poolexists $TESTPOOL1; then
+ destroy_pool $TESTPOOL1
+ fi
+
+ [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk with -r during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function replace_test
+{
+ typeset -i iters=2
+ typeset -i index=0
+ typeset opt=$1
+ typeset disk1=$2
+ typeset disk2=$3
+
+ typeset i=0
+ while [[ $i -lt $iters ]]; do
+ log_note "Invoking file_trunc with: $options_display"
+ file_trunc $options $TESTDIR/$TESTFILE.$i &
+ typeset pid=$!
+
+ sleep 1
+
+ child_pids="$child_pids $pid"
+ ((i = i + 1))
+ done
+
+ log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2
+
+ for wait_pid in $child_pids
+ do
+ kill $wait_pid
+ done
+ child_pids=""
+
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $TESTDIR $TESTPOOL1
+ log_must zfs umount $TESTPOOL1/$TESTFS1
+ log_must zdb -cdui $TESTPOOL1/$TESTFS1
+ log_must zfs mount $TESTPOOL1/$TESTFS1
+ verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+ log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+ specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+ ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for type in "" "mirror"; do
+ for op in "" "-f"; do
+ create_pool $TESTPOOL1 $type $specials_list
+ log_must zfs create $TESTPOOL1/$TESTFS1
+ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+ replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+ if [[ $? -ne 0 ]]; then
+ log_fail "$REPLACEFILE is not present."
+ fi
+
+ destroy_pool $TESTPOOL1
+ log_must rm -rf /$TESTPOOL1
+ done
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh
index 8f40436ff..253cf65e4 100755
--- a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh
@@ -104,9 +104,7 @@ function replace_test
((i = i + 1))
done
- log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2
-
- sleep 10
+ log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2
for wait_pid in $child_pids
do
@@ -119,11 +117,12 @@ function replace_test
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
+ verify_pool $TESTPOOL1
}
specials_list=""
i=0
-while [[ $i != 2 ]]; do
+while [[ $i != 3 ]]; do
log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
@@ -143,7 +142,7 @@ for type in "" "raidz" "mirror"; do
replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
- zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
+ zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -ne 0 ]]; then
log_fail "$REPLACEFILE is not present."
fi
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement.cfg b/tests/zfs-tests/tests/functional/replacement/replacement.cfg
index b2ba1b885..271317b1c 100644
--- a/tests/zfs-tests/tests/functional/replacement/replacement.cfg
+++ b/tests/zfs-tests/tests/functional/replacement/replacement.cfg
@@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""}
export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""}
export HOLES_COUNT=${HOLES_COUNT-"16384"} # FILESIZE/BLKSIZE/8
export REPLACEFILE="sparedisk"
+
+set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
+export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))
+export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
+export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh
index 9af1c972f..7896b2dbe 100755
--- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh
+++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh
@@ -20,7 +20,7 @@
#
. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
@@ -50,7 +50,7 @@ function cleanup
$ORIG_SCAN_SUSPEND_PROGRESS
log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX
log_must zinject -c all
- destroy_pool $TESTPOOL
+ destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
@@ -70,7 +70,7 @@ function verify_restarts # <msg> <cnt> <defer>
[[ -z "$defer" ]] && return
# use zdb to find which vdevs have the resilver defer flag
- VDEV_DEFERS=$(zdb -C $TESTPOOL | awk '
+ VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk '
/children/ { gsub(/[^0-9]/, ""); child = $0 }
/com\.datto:resilver_defer$/ { print child }
')
@@ -106,17 +106,17 @@ log_must set_tunable32 ZEVENT_LEN_MAX 512
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
-log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \
+log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \
raidz ${VDEV_FILES[@]}
# create 4 filesystems
for fs in fs{0..3}
do
- log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs
+ log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs
done
# simultaneously write 16M to each of them
-set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0
+set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0
log_note "Writing data files"
for path in ${DATAPATHS[@]}
do
@@ -131,7 +131,7 @@ do
if [[ $test == "with" ]]
then
- log_must zpool set feature@resilver_defer=enabled $TESTPOOL
+ log_must zpool set feature@resilver_defer=enabled $TESTPOOL1
RESTARTS=( "${DEFER_RESTARTS[@]}" )
VDEVS=( "${DEFER_VDEVS[@]}" )
VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}"
@@ -144,7 +144,7 @@ do
log_must set_tunable32 RESILVER_MIN_TIME_MS 50
# initiate a resilver and suspend the scan as soon as possible
- log_must zpool replace $TESTPOOL $VDEV_REPLACE
+ log_must zpool replace $TESTPOOL1 $VDEV_REPLACE
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
# there should only be 1 resilver start
@@ -152,16 +152,16 @@ do
# offline then online a vdev to introduce a new DTL range after current
# scan, which should restart (or defer) the resilver
- log_must zpool offline $TESTPOOL ${VDEV_FILES[2]}
- log_must zpool sync $TESTPOOL
- log_must zpool online $TESTPOOL ${VDEV_FILES[2]}
- log_must zpool sync $TESTPOOL
+ log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]}
+ log_must zpool sync $TESTPOOL1
+ log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]}
+ log_must zpool sync $TESTPOOL1
# there should now be 2 resilver starts w/o defer, 1 with defer
verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}"
# inject read io errors on vdev and verify resilver does not restart
- log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL
+ log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1
log_must cat ${DATAPATHS[1]} > /dev/null
log_must zinject -c all
@@ -173,17 +173,12 @@ do
log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
# wait for resilver to finish
- for iter in {0..59}
- do
- is_pool_resilvered $TESTPOOL && break
- sleep 1
- done
- is_pool_resilvered $TESTPOOL ||
- log_fail "resilver timed out"
+ log_must zpool wait -t resilver $TESTPOOL1
+ log_must is_pool_resilvered $TESTPOOL1
# wait for a few txg's to see if a resilver happens
- log_must zpool sync $TESTPOOL
- log_must zpool sync $TESTPOOL
+ log_must zpool sync $TESTPOOL1
+ log_must zpool sync $TESTPOOL1
# there should now be 2 resilver starts
verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}"
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh
index ebe5e693b..48763f9b2 100755
--- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh
+++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh
@@ -20,7 +20,7 @@
#
. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
@@ -40,7 +40,7 @@
function cleanup
{
log_must zinject -c all
- destroy_pool $TESTPOOL
+ destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY
}
@@ -56,25 +56,25 @@ log_must set_tunable32 SCAN_LEGACY 1
# create the pool and a 32M file (32k blocks)
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE
-log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]}
-log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1
+log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]}
+log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1
# determine objset/object
-objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
-object=$(ls -i /$TESTPOOL/file | awk '{print $1}')
+objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
+object=$(ls -i /$TESTPOOL1/file | awk '{print $1}')
# inject event to cause error during resilver
-log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL
+log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1
# clear events and start resilver
log_must zpool events -c
-log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE
+log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE
log_note "waiting for read errors to start showing up"
for iter in {0..59}
do
- zpool sync $TESTPOOL
- err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}')
+ zpool sync $TESTPOOL1
+ err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}')
(( $err > 0 )) && break
sleep 1
done
@@ -92,8 +92,8 @@ done
(( $finish == 0 )) && log_fail "resilver took too long to finish"
# wait a few syncs to ensure that zfs does not restart the resilver
-log_must zpool sync $TESTPOOL
-log_must zpool sync $TESTPOOL
+log_must zpool sync $TESTPOOL1
+log_must zpool sync $TESTPOOL1
# check if resilver was restarted
start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l)
diff --git a/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh
new file mode 100755
index 000000000..da8a0a26e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh
@@ -0,0 +1,112 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Verify scrub behaves as intended when contending with a healing or
+# sequential resilver.
+#
+# STRATEGY:
+# 1. Create a pool
+# 2. Add a modest amount of data to the pool.
+# 3. For healing and sequential resilver:
+# a. Start scrubbing.
+# b. Verify a resilver can be started and it cancels the scrub.
+# c. Verify a scrub cannot be started when resilvering
+#
+
+function cleanup
+{
+ log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+ $ORIG_SCAN_SUSPEND_PROGRESS
+ destroy_pool $TESTPOOL1
+ rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Scrub was cancelled by resilver"
+
+ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS)
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64
+log_must zpool sync $TESTPOOL1
+
+# Request a healing or sequential resilver
+for replace_mode in "healing" "sequential"; do
+
+ #
+ # Healing resilvers abort the dsl_scan and reconfigure it for
+ # resilvering. Sequential resilvers cancel the dsl_scan and start
+ # the vdev_rebuild thread.
+ #
+ if [[ "$replace_mode" = "healing" ]]; then
+ history_msg="scan aborted, restarting"
+ flags=""
+ else
+ history_msg="scan cancelled"
+ flags="-s"
+ fi
+
+ # Limit scanning time and suspend the scan as soon as possible.
+ log_must set_tunable32 RESILVER_MIN_TIME_MS 50
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+ # Initiate a scrub.
+ log_must zpool scrub $TESTPOOL1
+
+ # Initiate a resilver to cancel the scrub.
+ log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \
+ $SPARE_VDEV_FILE
+
+ # Verify the scrub was canceled; it may take a few seconds to exit.
+ while is_pool_scrubbing $TESTPOOL1; do
+ sleep 1
+ done
+ log_mustnot is_pool_scrubbing $TESTPOOL1
+
+ # Verify a scrub cannot be started while resilvering.
+ log_must is_pool_resilvering $TESTPOOL1
+ log_mustnot zpool scrub $TESTPOOL1
+
+ # Unsuspend resilver.
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+ log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
+
+ # Wait for resilver to finish, then put the original back.
+ log_must zpool wait $TESTPOOL1
+ log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \
+ ${VDEV_FILES[1]}
+done
+log_pass "Scrub was cancelled by resilver"
+
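
As a reading aid only (not part of the patch): the scrub-versus-resilver behavior exercised above can be reproduced by hand with the same commands the test uses. The pool and device names below are placeholders, and -s selects the sequential (rebuild) resilver path, just as flags="-s" does in the loop above.

    # Start a scrub, then interrupt it with a sequential resilver.
    zpool scrub tank
    zpool replace -s tank sdb sdc    # cancels the scrub, starts a rebuild
    zpool status tank                # expected: rebuild running, scrub gone
    zpool scrub tank                 # expected to fail while the resilver runs
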
diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am
deleted file mode 100644
index 38136a843..000000000
--- a/tests/zfs-tests/tests/functional/resilver/Makefile.am
+++ /dev/null
@@ -1,9 +0,0 @@
-pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver
-dist_pkgdata_SCRIPTS = \
- setup.ksh \
- cleanup.ksh \
- resilver_restart_001.ksh \
- resilver_restart_002.ksh
-
-dist_pkgdata_DATA = \
- resilver.cfg
diff --git a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh b/tests/zfs-tests/tests/functional/resilver/cleanup.ksh
deleted file mode 100755
index 4dfa81424..000000000
--- a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-verify_runnable "global"
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver.cfg b/tests/zfs-tests/tests/functional/resilver/resilver.cfg
deleted file mode 100644
index 88dfd24ae..000000000
--- a/tests/zfs-tests/tests/functional/resilver/resilver.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-verify_runnable "global"
-
-set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
-SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
-
-VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))
diff --git a/tests/zfs-tests/tests/functional/resilver/setup.ksh b/tests/zfs-tests/tests/functional/resilver/setup.ksh
deleted file mode 100755
index 4dfa81424..000000000
--- a/tests/zfs-tests/tests/functional/resilver/setup.ksh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-verify_runnable "global"
-
-log_pass