aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
Diffstat (limited to 'module')
-rw-r--r--module/zfs/mmp.c104
-rw-r--r--module/zfs/spa.c4
-rw-r--r--module/zfs/vdev_label.c1
3 files changed, 68 insertions, 41 deletions
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 14379d804..e50e35665 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -280,6 +280,59 @@ mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
return (error_mask);
}
+/*
+ * MMP writes are issued on a fixed schedule, but may complete at variable,
+ * much longer, intervals. The mmp_delay captures long periods between
+ * successful writes for any reason, including disk latency, scheduling delays,
+ * etc.
+ *
+ * The mmp_delay is usually calculated as a decaying average, but if the latest
+ * delay is higher we do not average it, so that we do not hide sudden spikes
+ * which the importing host must wait for.
+ *
+ * If writes are occurring frequently, such as due to a high rate of txg syncs,
+ * the mmp_delay could become very small. Since those short delays depend on
+ * activity we cannot count on, we never allow mmp_delay to fall below the
+ * rate expected if only mmp_thread writes occur.
+ *
+ * If an mmp write was skipped or failed, and we have already waited longer
+ * than mmp_delay, we need to update it so the next write reflects the longer delay.
+ *
+ * Do not set mmp_delay if the multihost property is not on, so as not to
+ * trigger an activity check on import.
+ */
+static void
+mmp_delay_update(spa_t *spa, boolean_t write_completed)
+{
+ mmp_thread_t *mts = &spa->spa_mmp;
+ hrtime_t delay = gethrtime() - mts->mmp_last_write;
+
+ ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
+
+ if (spa_multihost(spa) == B_FALSE) {
+ mts->mmp_delay = 0;
+ return;
+ }
+
+ if (delay > mts->mmp_delay)
+ mts->mmp_delay = delay;
+
+ if (write_completed == B_FALSE)
+ return;
+
+ mts->mmp_last_write = gethrtime();
+
+ /*
+ * Strictly less than, in case mmp_delay was just raised to delay above.
+ */
+ if (delay < mts->mmp_delay) {
+ hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
+ vdev_count_leaves(spa);
+ mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+ min_delay);
+ }
+}
+
static void
mmp_write_done(zio_t *zio)
{
@@ -291,38 +344,8 @@ mmp_write_done(zio_t *zio)
uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
- if (zio->io_error)
- goto unlock;
-
- /*
- * Mmp writes are queued on a fixed schedule, but under many
- * circumstances, such as a busy device or faulty hardware,
- * the writes will complete at variable, much longer,
- * intervals. In these cases, another node checking for
- * activity must wait longer to account for these delays.
- *
- * The mmp_delay is calculated as a decaying average of the interval
- * between completed mmp writes. This is used to predict how long
- * the import must wait to detect activity in the pool, before
- * concluding it is not in use.
- *
- * Do not set mmp_delay if the multihost property is not on,
- * so as not to trigger an activity check on import.
- */
- if (spa_multihost(spa)) {
- hrtime_t delay = gethrtime() - mts->mmp_last_write;
+ mmp_delay_update(spa, (zio->io_error == 0));
- if (delay > mts->mmp_delay)
- mts->mmp_delay = delay;
- else
- mts->mmp_delay = (delay + mts->mmp_delay * 127) /
- 128;
- } else {
- mts->mmp_delay = 0;
- }
- mts->mmp_last_write = gethrtime();
-
-unlock:
vd->vdev_mmp_pending = 0;
vd->vdev_mmp_kstat_id = 0;
@@ -348,6 +371,7 @@ mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
mutex_enter(&mmp->mmp_io_lock);
mmp->mmp_ub = *ub;
mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp_delay_update(spa, B_TRUE);
mutex_exit(&mmp->mmp_io_lock);
}
@@ -386,6 +410,7 @@ mmp_write_uberblock(spa_t *spa)
*/
if (error) {
+ mmp_delay_update(spa, B_FALSE);
if (mmp->mmp_skip_error == error) {
spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
} else {
@@ -463,15 +488,14 @@ mmp_thread(void *arg)
MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
boolean_t suspended = spa_suspended(spa);
boolean_t multihost = spa_multihost(spa);
- hrtime_t start, next_time;
+ hrtime_t next_time;
- start = gethrtime();
- if (multihost) {
- next_time = start + mmp_interval /
+ if (multihost)
+ next_time = gethrtime() + mmp_interval /
MAX(vdev_count_leaves(spa), 1);
- } else {
- next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
- }
+ else
+ next_time = gethrtime() +
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL);
/*
* MMP off => on, or suspended => !suspended:
@@ -515,11 +539,11 @@ mmp_thread(void *arg)
* mmp_interval * mmp_fail_intervals nanoseconds.
*/
if (!suspended && mmp_fail_intervals && multihost &&
- (start - mmp->mmp_last_write) > max_fail_ns) {
+ (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llus; suspending pool",
spa_name(spa),
- NSEC2SEC(start - mmp->mmp_last_write));
+ NSEC2SEC(gethrtime() - mmp->mmp_last_write));
zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 4b6196cc3..53b5aabf0 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2462,6 +2462,10 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_delay = MAX(import_delay, import_intervals *
MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)));
+ zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u "
+ "leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals,
+ vdev_count_leaves(spa));
+
/* Add a small random factor in case of simultaneous imports (0-25%) */
import_expire = gethrtime() + import_delay +
(import_delay * spa_get_random(250) / 1000);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 26fc93645..4fee4bc7a 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1495,7 +1495,6 @@ retry:
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
goto retry;
-
if (spa_multihost(spa))
mmp_update_uberblock(spa, ub);