aboutsummaryrefslogtreecommitdiffstats
path: root/cmd/zpool
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2023-01-25 11:28:54 -0800
committerGitHub <[email protected]>2023-01-25 11:28:54 -0800
commitc85ac731a0ec16e4277857b55ebe123c552365b6 (patch)
tree27887ebb5f2cbf5b9c7b6ddec90a27c2b1515bb8 /cmd/zpool
parent9cd71c8604d52def22ffaddc35755712f0fb9349 (diff)
Improve resilver ETAs
When resilvering the estimated time remaining is calculated using the average issue rate over the current pass. Where the current pass starts when a scan was started, or restarted, if the pool was exported/imported. For dRAID pools in particular this can result in wildly optimistic estimates since the issue rate will be very high while scanning when non-degraded regions of the pool are scanned. Once repair I/O starts being issued performance drops to a realistic number but the estimated performance is still significantly skewed. To address this we redefine a pass such that it starts after a scanning phase completes so the issue rate is more reflective of recent performance. Additionally, the zfs_scan_report_txgs module option can be set to reset the pass statistics more often. Reviewed-by: Akash B <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #14410
Diffstat (limited to 'cmd/zpool')
-rw-r--r--cmd/zpool/zpool_main.c33
1 files changed, 22 insertions, 11 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 93d6a1898..efb2d10e5 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7524,19 +7524,20 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf));
- assert(ps->pss_func == POOL_SCAN_SCRUB ||
- ps->pss_func == POOL_SCAN_RESILVER);
+ int is_resilver = ps->pss_func == POOL_SCAN_RESILVER;
+ int is_scrub = ps->pss_func == POOL_SCAN_SCRUB;
+ assert(is_resilver || is_scrub);
/* Scan is finished or canceled. */
if (ps->pss_state == DSS_FINISHED) {
secs_to_dhms(end - start, time_buf);
- if (ps->pss_func == POOL_SCAN_SCRUB) {
+ if (is_scrub) {
(void) printf(gettext("scrub repaired %s "
"in %s with %llu errors on %s"), processed_buf,
time_buf, (u_longlong_t)ps->pss_errors,
ctime(&end));
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ } else if (is_resilver) {
(void) printf(gettext("resilvered %s "
"in %s with %llu errors on %s"), processed_buf,
time_buf, (u_longlong_t)ps->pss_errors,
@@ -7544,10 +7545,10 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
}
return;
} else if (ps->pss_state == DSS_CANCELED) {
- if (ps->pss_func == POOL_SCAN_SCRUB) {
+ if (is_scrub) {
(void) printf(gettext("scrub canceled on %s"),
ctime(&end));
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ } else if (is_resilver) {
(void) printf(gettext("resilver canceled on %s"),
ctime(&end));
}
@@ -7557,7 +7558,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
assert(ps->pss_state == DSS_SCANNING);
/* Scan is in progress. Resilvers can't be paused. */
- if (ps->pss_func == POOL_SCAN_SCRUB) {
+ if (is_scrub) {
if (pause == 0) {
(void) printf(gettext("scrub in progress since %s"),
ctime(&start));
@@ -7567,7 +7568,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
(void) printf(gettext("\tscrub started on %s"),
ctime(&start));
}
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ } else if (is_resilver) {
(void) printf(gettext("resilver in progress since %s"),
ctime(&start));
}
@@ -7609,17 +7610,27 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
scanned_buf, issued_buf, total_buf);
}
- if (ps->pss_func == POOL_SCAN_RESILVER) {
+ if (is_resilver) {
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
processed_buf, 100 * fraction_done);
- } else if (ps->pss_func == POOL_SCAN_SCRUB) {
+ } else if (is_scrub) {
(void) printf(gettext("\t%s repaired, %.2f%% done"),
processed_buf, 100 * fraction_done);
}
if (pause == 0) {
+ /*
+ * Only provide an estimate iff:
+ * 1) the time remaining is valid, and
+ * 2) the issue rate exceeds 10 MB/s, and
+ * 3) it's either:
+ * a) a resilver which has started repairs, or
+ * b) a scrub which has entered the issue phase.
+ */
if (total_secs_left != UINT64_MAX &&
- issue_rate >= 10 * 1024 * 1024) {
+ issue_rate >= 10 * 1024 * 1024 &&
+ ((is_resilver && ps->pss_processed > 0) ||
+ (is_scrub && issued > 0))) {
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "