| author | Alexander Motin <[email protected]> | 2022-05-24 12:46:35 -0400 |
|---|---|---|
| committer | Tony Hutter <[email protected]> | 2022-09-26 14:55:27 -0700 |
| commit | 33223cbc3cbed37fdcecedc18b4b82406c73c01b (patch) | |
| tree | 68a60589313fde3a77248647071134969ec89261 /module/zfs | |
| parent | 91e02156ddea27d980fa8e5f7f3d10dda06139d2 (diff) | |
Refactor Log Size Limit
The original Log Size Limit implementation blocked all writes once the
limit was reached, until the TXG was committed and the log was freed.
This caused huge delays and subsequent speed spikes in application
writes. The new implementation instead throttles writes smoothly, using
exactly the same mechanism as is used for dirty data.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: jxdking <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
Issue #12284
Closes #13476
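The mechanism referenced above is the dirty-data delay curve: instead of a hard stop at zfs_wrlog_data_max, each transaction is charged a minimum transaction time that ramps up hyperbolically once the outstanding TX_WRITE log crosses a threshold, capped at zfs_delay_max_ns. Below is a minimal user-space sketch of the wrlog half of the new dmu_tx_delay() logic; the wrlog_delay_ns() helper, the main() harness, and the tunable values are illustrative assumptions, while the tunable names and the formula mirror the diff further down.

```c
#include <stdint.h>
#include <stdio.h>

/* Tunables named as in the diff; the values here are illustrative only. */
static uint64_t zfs_wrlog_data_max = 4ULL << 30;        /* 4 GiB */
static uint64_t zfs_delay_min_dirty_percent = 60;       /* ramp start */
static uint64_t zfs_delay_scale = 500000;               /* ns */
static uint64_t zfs_delay_max_ns = 100000000;           /* 100 ms cap */

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/*
 * Hypothetical standalone version of the wrlog term of dmu_tx_delay():
 * no delay below the threshold, a hyperbolic ramp above it, and a hard
 * cap of zfs_delay_max_ns at or beyond the limit.
 */
static uint64_t
wrlog_delay_ns(uint64_t wrlog)
{
        uint64_t delay_min_bytes =
            zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;

        if (wrlog >= zfs_wrlog_data_max)
                return (zfs_delay_max_ns);
        if (wrlog <= delay_min_bytes)
                return (0);
        return (MIN(zfs_delay_scale * (wrlog - delay_min_bytes) /
            (zfs_wrlog_data_max - wrlog), zfs_delay_max_ns));
}

int
main(void)
{
        /* Sweep from idle to the limit to show the smooth ramp. */
        for (uint64_t gb = 0; gb <= 4; gb++) {
                uint64_t wrlog = gb << 30;
                printf("wrlog=%lluGiB delay=%lluns\n",
                    (unsigned long long)gb,
                    (unsigned long long)wrlog_delay_ns(wrlog));
        }
        return (0);
}
```

With these example values, throttling begins at 2.4 GiB of outstanding wrlog (60% of 4 GiB); at 3 GiB each transaction is charged about 300 µs, climbing toward the 100 ms cap as the limit is approached.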
Diffstat (limited to 'module/zfs')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | module/zfs/dmu_tx.c | 58 |
| -rw-r--r-- | module/zfs/dsl_pool.c | 17 |

2 files changed, 48 insertions, 27 deletions
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 5fa516866..1eed0526b 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
         { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
         { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
         { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
-        { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
         { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
+        { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
         { "dmu_tx_quota", KSTAT_DATA_UINT64 },
 };
 
@@ -781,34 +781,49 @@ static void
 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
         dsl_pool_t *dp = tx->tx_pool;
-        uint64_t delay_min_bytes =
+        uint64_t delay_min_bytes, wrlog;
+        hrtime_t wakeup, tx_time = 0, now;
+
+        /* Calculate minimum transaction time for the dirty data amount. */
+        delay_min_bytes =
             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-        hrtime_t wakeup, min_tx_time, now;
+        if (dirty > delay_min_bytes) {
+                /*
+                 * The caller has already waited until we are under the max.
+                 * We make them pass us the amount of dirty data so we don't
+                 * have to handle the case of it being >= the max, which
+                 * could cause a divide-by-zero if it's == the max.
+                 */
+                ASSERT3U(dirty, <, zfs_dirty_data_max);
 
-        if (dirty <= delay_min_bytes)
-                return;
+                tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
+                    (zfs_dirty_data_max - dirty);
+        }
 
-        /*
-         * The caller has already waited until we are under the max.
-         * We make them pass us the amount of dirty data so we don't
-         * have to handle the case of it being >= the max, which could
-         * cause a divide-by-zero if it's == the max.
-         */
-        ASSERT3U(dirty, <, zfs_dirty_data_max);
+        /* Calculate minimum transaction time for the TX_WRITE log size. */
+        wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
+        delay_min_bytes =
+            zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
+        if (wrlog >= zfs_wrlog_data_max) {
+                tx_time = zfs_delay_max_ns;
+        } else if (wrlog > delay_min_bytes) {
+                tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
+                    (zfs_wrlog_data_max - wrlog), tx_time);
+        }
 
+        if (tx_time == 0)
+                return;
+
+        tx_time = MIN(tx_time, zfs_delay_max_ns);
         now = gethrtime();
-        min_tx_time = zfs_delay_scale *
-            (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
-        min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
-        if (now > tx->tx_start + min_tx_time)
+        if (now > tx->tx_start + tx_time)
                 return;
 
         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
-            uint64_t, min_tx_time);
+            uint64_t, tx_time);
 
         mutex_enter(&dp->dp_lock);
-        wakeup = MAX(tx->tx_start + min_tx_time,
-            dp->dp_last_wakeup + min_tx_time);
+        wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
         dp->dp_last_wakeup = wakeup;
         mutex_exit(&dp->dp_lock);
 
@@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
         }
 
         if (!tx->tx_dirty_delayed &&
-            dsl_pool_wrlog_over_max(tx->tx_pool)) {
-                DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
+            dsl_pool_need_wrlog_delay(tx->tx_pool)) {
+                tx->tx_wait_dirty = B_TRUE;
+                DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
                 return (SET_ERROR(ERESTART));
         }
 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index b91dd4cfa..4036c8671 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
 /*
- * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
- * Once it is reached, write operation is blocked,
- * until log data is cleared out after txg sync.
+ * The upper limit of TX_WRITE log data.  Write operations are throttled
+ * when approaching the limit until log data is cleared out after txg sync.
  * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
  */
 unsigned long zfs_wrlog_data_max = 0;
@@ -621,15 +620,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
 
         /* Choose a value slightly bigger than min dirty sync bytes */
         uint64_t sync_min =
-            zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
+            zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
         if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
                 txg_kick(dp, txg);
 }
 
 boolean_t
-dsl_pool_wrlog_over_max(dsl_pool_t *dp)
+dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
 {
-        return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
+        uint64_t delay_min_bytes =
+            zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
+
+        return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
 }
 
 static void
@@ -639,6 +641,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
         delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
         aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
         aggsum_add(&dp->dp_wrlog_total, delta);
+        /* Compact per-CPU sums after the big change. */
+        (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
+        (void) aggsum_value(&dp->dp_wrlog_total);
 }
 
 #ifdef ZFS_DEBUG
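Two quieter changes in dsl_pool.c complete the picture. dsl_pool_need_wrlog_delay() now fires at the start of the delay ramp rather than at the hard limit, so dmu_tx_try_assign() reroutes a transaction into the dirty-wait path (tx_wait_dirty = B_TRUE) as soon as the pooled wrlog exceeds zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100. And the per-txg kick threshold is now derived from the wrlog limit itself: assuming the usual defaults of zfs_dirty_data_sync_percent = 20 and zfs_delay_min_dirty_percent = 60 (module-parameter defaults, not part of this diff), sync_min = zfs_wrlog_data_max * (20 + 10) / 200, i.e. 15% of the limit per txg, so a sync is requested well before throttling begins at 60%.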