From f4c594da94d856c422512a54e48070f890b2685b Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Thu, 25 Apr 2019 13:16:24 -0400 Subject: Fixes for the DMU free throttle This patch fixes 2 issues with the DMU free throttle implemented in dmu_free_long_range(). The first issue is that get_next_chunk() was calculating the number of L1 blocks the free would dirty incorrectly. In some cases involving extremely large files, this code would greatly overestimate the number of effected L1 blocks, causing excessive calls to txg_wait_open(). This patch corrects the calculation. The second issue is that the free throttle uses the total number of free'd blocks in all (open, quiescing, and syncing) txgs to determine whether to throttle. This causes large frees (such as those created by the first issue) to cause 4 txg syncs before any further frees were allowed to proceed. This patch ensures that the accounting is done entirely in a per-txg fashion, so that frees from a given txg don't affect those that immediately follow it. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Signed-off-by: Tom Caputi Closes #8655 --- module/zfs/dmu.c | 65 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'module') diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b047a93ef..1697a6320 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -724,13 +724,18 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) ASSERT3U(minimum, <=, *start); - if (*start - minimum <= iblkrange * maxblks) { + /* + * Check if we can free the entire range assuming that all of the + * L1 blocks in this range have data. If we can, we use this + * worst case value as an estimate so we can avoid having to look + * at the object's actual data. + */ + uint64_t total_l1blks = + (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / + iblkrange; + if (total_l1blks <= maxblks) { + *l1blks = total_l1blks; *start = minimum; - /* - * Assume full L1 blocks and 128k recordsize to approximate the - * expected number of L1 blocks in this chunk - */ - *l1blks = minimum / (1024 * 128 * 1024); return (0); } ASSERT(ISP2(iblkrange)); @@ -745,6 +750,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) * to search. */ (*start)--; + err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); @@ -763,6 +769,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) if (*start < minimum) *start = minimum; *l1blks = blks; + return (0); } @@ -809,7 +816,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; - uint64_t long_free_dirty_all_txgs = 0; uint64_t l1blks; dmu_tx_t *tx; @@ -827,25 +833,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, chunk_len = chunk_end - chunk_begin; - mutex_enter(&dp->dp_lock); - for (int t = 0; t < TXG_SIZE; t++) { - long_free_dirty_all_txgs += - dp->dp_long_free_dirty_pertxg[t]; - } - mutex_exit(&dp->dp_lock); - - /* - * To avoid filling up a TXG with just frees wait for - * the next TXG to open before freeing more chunks if - * we have reached the threshold of frees - */ - if (dirty_frees_threshold != 0 && - long_free_dirty_all_txgs >= dirty_frees_threshold) { - DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); - txg_wait_open(dp, 0, B_TRUE); - continue; - } - tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); @@ -860,6 +847,26 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, return (err); } + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&dp->dp_lock); + uint64_t long_free_dirty = + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; + mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees, wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees. + */ + if (dirty_frees_threshold != 0 && + long_free_dirty >= dirty_frees_threshold) { + DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); + dmu_tx_commit(tx); + txg_wait_open(dp, 0, B_TRUE); + continue; + } + /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied @@ -871,12 +878,12 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); - dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, - uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, - uint64_t, dmu_tx_get_txg(tx)); + uint64_t, long_free_dirty, uint64_t, chunk_len, + uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); -- cgit v1.2.3