diff options
author | Prakash Surya <[email protected]> | 2013-12-30 09:30:00 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2014-02-21 16:10:49 -0800 |
commit | da8ccd0ee0d4eed08200e706cb2ca1da3bdfb5cf (patch) | |
tree | 1143c40bffebce762270bb5a5a369d1add0fd5d5 | |
parent | 77765b540b79bdc42d4f25f174004bbbd06b0a32 (diff) |
Prioritize "metadata" in arc_get_data_buf
When the arc is at it's size limit and a new buffer is added, data will
be evicted (or recycled) from the arc to make room for this new buffer.
As far as I can tell, this is to try and keep the arc from over stepping
it's bounds (i.e. keep it below the size limitation placed on it).
This makes sense conceptually, but there appears to be a subtle flaw in
its current implementation, resulting in metadata buffers being
throttled. When it evicts from the arc's lists, it also passes in a
"type" so as to remove a buffer of the same type that it is adding. The
problem with this is that once the size limit is hit, the ratio of
"metadata" to "data" contained in the arc essentially becomes fixed.
For example, consider the following scenario:
* the size of the arc is capped at 10G
* the meta_limit is capped at 4G
* 9G of the arc contains "data"
* 1G of the arc contains "metadata"
Now, every time a new "metadata" buffer is created and added to the arc,
an older "metadata" buffer(s) will be removed from the arc; preserving
the 9G "data" to 1G "metadata" ratio that was in-place when the size
limit was reached. This occurs even though the amount of "metadata" is
far below the "metadata" limit. This can result in the arc behaving
pathologically for certain workloads.
To fix this, the arc_get_data_buf function was modified to evict "data"
from the arc even when adding a "metadata" buffer; unless it's at the
"metadata" limit. In addition, arc_evict now more closely resembles
arc_evict_ghost; such that when evicting "data" from the arc, it may
make a second pass over the arc lists and evict "metadata" if it cannot
meet the eviction size the first time around.
Signed-off-by: Prakash Surya <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #2110
-rw-r--r-- | module/zfs/arc.c | 62 |
1 files changed, 40 insertions, 22 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f28748423..242d0c8c5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1890,6 +1890,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; +top: mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); @@ -2005,6 +2006,15 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_evicted < bytes)) { + /* Prevent second pass from recycling metadata into data */ + recycle = FALSE; + type = ARC_BUFC_METADATA; + list = &state->arcs_list[type]; + goto top; + } + if (bytes_evicted < bytes) dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state); @@ -2146,16 +2156,9 @@ arc_adjust(void) adjustment = MIN((int64_t)(arc_size - arc_c), (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + if (adjustment > 0 && arc_mru->arcs_size > 0) { + delta = MIN(arc_mru->arcs_size, adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } - - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, - ARC_BUFC_METADATA); } /* @@ -2164,17 +2167,9 @@ arc_adjust(void) adjustment = arc_size - arc_c; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + if (adjustment > 0 && arc_mfu->arcs_size > 0) { + delta = MIN(arc_mfu->arcs_size, adjustment); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } - - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t delta = MIN(adjustment, - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, - ARC_BUFC_METADATA); } /* @@ -2752,6 +2747,8 @@ arc_get_data_buf(arc_buf_t *buf) arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t evict = ARC_BUFC_DATA; + boolean_t recycle = TRUE; arc_adapt(size, state); @@ -2792,7 +2789,24 @@ arc_get_data_buf(arc_buf_t *buf) mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { + /* + * Evict data buffers prior to metadata buffers, unless we're + * over the metadata limit and adding a metadata buffer. + */ + if (type == ARC_BUFC_METADATA) { + if (arc_meta_used >= arc_meta_limit) + evict = ARC_BUFC_METADATA; + else + /* + * In this case, we're evicting data while + * adding metadata. Thus, to prevent recycling a + * data buffer into a metadata buffer, recycling + * is disabled in the following arc_evict call. + */ + recycle = FALSE; + } + + if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); @@ -2803,8 +2817,10 @@ arc_get_data_buf(arc_buf_t *buf) * via the prune callback to drop references. The * prune callback in run in the context of the reclaim * thread to avoid deadlocking on the hash_lock. + * Of course, only do this when recycle is true. */ - cv_signal(&arc_reclaim_thr_cv); + if (recycle) + cv_signal(&arc_reclaim_thr_cv); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); @@ -2812,7 +2828,9 @@ arc_get_data_buf(arc_buf_t *buf) atomic_add_64(&arc_size, size); } - ARCSTAT_BUMP(arcstat_recycle_miss); + /* Only bump this if we tried to recycle and failed */ + if (recycle) + ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); out: |