From e48afbc4ebc64712f56220b65ed12c536ef0e114 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Tue, 5 Dec 2017 09:45:46 -0800 Subject: OpenZFS 9464 - txg_kick() fails to see that we are quiescing txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes Creating a fragmented pool in a DCenter VM and continuously writing to it with multiple instances of randwritecomp, we get the following output from txg.d: 0ms 311MB in 4114ms (95% p1) 75MB/s 544MB (76%) 336us 153ms 0ms 0ms 8MB in 51ms ( 0% p1) 163MB/s 474MB (66%) 129us 34ms 0ms 0ms 366MB in 4454ms (93% p1) 82MB/s 572MB (79%) 498us 20ms 0ms 0ms 406MB in 5212ms (95% p1) 77MB/s 591MB (82%) 661us 37ms 0ms 0ms 340MB in 5110ms (94% p1) 66MB/s 622MB (86%) 1048us 41ms 1ms 0ms 3MB in 61ms ( 0% p1) 51MB/s 419MB (58%) 33us 0ms 0ms 0ms 361MB in 3555ms (88% p1) 101MB/s 542MB (75%) 335us 40ms 0ms 0ms 356MB in 4592ms (92% p1) 77MB/s 561MB (78%) 430us 89ms 1ms 0ms 11MB in 129ms (13% p1) 90MB/s 507MB (70%) 222us 15ms 0ms 0ms 281MB in 2520ms (89% p1) 111MB/s 542MB (75%) 334us 42ms 0ms 0ms 383MB in 3666ms (91% p1) 104MB/s 557MB (77%) 411us 133ms 0ms 0ms 404MB in 5757ms (94% p1) 70MB/s 635MB (88%) 1274us 123ms 2ms 4ms 367MB in 4172ms (89% p1) 88MB/s 556MB (77%) 401us 51ms 0ms 0ms 42MB in 470ms (44% p1) 90MB/s 557MB (77%) 412us 43ms 0ms 0ms 261MB in 2273ms (88% p1) 114MB/s 556MB (77%) 407us 27ms 0ms 0ms 394MB in 3646ms (85% p1) 108MB/s 552MB (77%) 393us 304ms 0ms 0ms 275MB in 2416ms (89% p1) 113MB/s 510MB (71%) 200us 53ms 0ms 0ms 9MB in 53ms ( 0% p1) 169MB/s 483MB (67%) 140us 100ms 1ms The TXGs that are getting synced and don't have lots of changes are pushed by txg_kick() which basically forces the current open txg to get to the quiesced state: if (tx->tx_syncing_txg == 0 && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; cv_broadcast(&tx->tx_quiesce_more_cv); } The problem is that the above code doesn't check if we are currently quiescing anything (only if a quiesce or a sync has been requested, ..etc) so the following scenario can happen: 1] We have an open txg A that had enough dirty data (more than zfs_dirty_data_sync) and it was pushed to the quiesced state, and opened a new txg B. No txg is currently being synced. 2] Immediately after the opening of B, txg_kick() was run by some other write (and because of A's dirty data) and saw that we are not currently syncing any txg and no one has requested quiescing so it requests one by bumping tx_quiesce_txg_waiting and broadcasts the quiesce thread. 3] The quiesce thread just passed txg A to be synced and sees that a quiescing request has been sent to it so it immediately grabs B without letting it gather enough data, putting it in a quiesced state and opening a new txg C. In this scenario txg B, is an example of how the entries of interest show up in the txg.d output. Ideally we would like txg_kick() to get triggered only when we are sure that we are not syncing AND not quiescing any txg. This way we can kick an open TXG to the quiescing state when we are sure that there is nothing going on and we would benefit from the different states running concurrently. Authored by: Serapheim Dimitropoulos Reviewed by: Matt Ahrens Reviewed by: Brad Lewis Reviewed by: Andriy Gapon Approved by: Dan McDonald Ported-by: Brian Behlendorf OpenZFS-issue: https://illumos.org/issues/9464 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1cd7635b Closes #7587 --- module/zfs/dmu_tx.c | 7 ++++--- module/zfs/txg.c | 37 +++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 7 deletions(-) (limited to 'module/zfs') diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 374bb710c..ab6ec29e7 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1097,10 +1097,11 @@ dmu_tx_wait(dmu_tx_t *tx) tx->tx_needassign_txh = NULL; } else { /* - * A dnode is assigned to the quiescing txg. Wait for its - * transaction to complete. + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. */ - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 30f251573..16b2f845b 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -476,6 +476,30 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { @@ -504,7 +528,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); @@ -517,7 +541,7 @@ txg_sync_thread(void *arg) * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -532,6 +556,7 @@ txg_sync_thread(void *arg) * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; @@ -582,7 +607,7 @@ txg_quiesce_thread(void *arg) */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) @@ -592,6 +617,8 @@ txg_quiesce_thread(void *arg) dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); @@ -600,6 +627,7 @@ txg_quiesce_thread(void *arg) * Hand this txg off to the sync thread. */ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); @@ -699,7 +727,8 @@ txg_kick(dsl_pool_t *dp) ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { -- cgit v1.2.3