path: root/module/zfs/arc.c
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--  module/zfs/arc.c  227
1 file changed, 149 insertions(+), 78 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cbe0a6028..ccc9510fd 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -172,8 +172,11 @@ int arc_evict_iterations = 100;
/* number of seconds before growing cache again */
int zfs_arc_grow_retry = 5;
-/* shift of arc_c for calculating both min and max arc_p */
-int zfs_arc_p_min_shift = 4;
+/* disable anon data from aggressively growing arc_p */
+int zfs_arc_p_aggressive_disable = 1;
+
+/* disable arc_p adapt dampener in arc_adapt */
+int zfs_arc_p_dampener_disable = 1;
/* log2(fraction of arc to reclaim) */
int zfs_arc_shrink_shift = 5;
@@ -305,6 +308,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
+ kstat_named_t arcstat_meta_size;
kstat_named_t arcstat_other_size;
kstat_named_t arcstat_anon_size;
kstat_named_t arcstat_anon_evict_data;
@@ -392,6 +396,7 @@ static arc_stats_t arc_stats = {
{ "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
+ { "meta_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
{ "anon_size", KSTAT_DATA_UINT64 },
{ "anon_evict_data", KSTAT_DATA_UINT64 },
@@ -1364,6 +1369,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
+ case ARC_SPACE_META:
+ ARCSTAT_INCR(arcstat_meta_size, space);
+ break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
break;
@@ -1375,7 +1383,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
break;
}
- ARCSTAT_INCR(arcstat_meta_used, space);
+ if (type != ARC_SPACE_DATA)
+ ARCSTAT_INCR(arcstat_meta_used, space);
+
atomic_add_64(&arc_size, space);
}
@@ -1390,6 +1400,9 @@ arc_space_return(uint64_t space, arc_space_type_t type)
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
+ case ARC_SPACE_META:
+ ARCSTAT_INCR(arcstat_meta_size, -space);
+ break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
break;
@@ -1401,10 +1414,13 @@ arc_space_return(uint64_t space, arc_space_type_t type)
break;
}
- ASSERT(arc_meta_used >= space);
- if (arc_meta_max < arc_meta_used)
- arc_meta_max = arc_meta_used;
- ARCSTAT_INCR(arcstat_meta_used, -space);
+ if (type != ARC_SPACE_DATA) {
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ ARCSTAT_INCR(arcstat_meta_used, -space);
+ }
+
ASSERT(arc_size >= space);
atomic_add_64(&arc_size, -space);
}
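
The two accounting hunks above give metadata buffers their own ARC_SPACE_META bucket and stop ARC_SPACE_DATA allocations from being counted in arc_meta_used, so the metadata limit now tracks only genuinely metadata-backed memory. A minimal standalone sketch of that split (the type names, counters, and functions below are illustrative, not the real ZFS symbols):

    #include <stdint.h>
    #include <assert.h>

    typedef enum { SPACE_DATA, SPACE_META, SPACE_OTHER } space_type_t;

    static uint64_t data_size, meta_size, other_size;
    static uint64_t meta_used, total_size;

    static void
    space_consume(uint64_t space, space_type_t type)
    {
        switch (type) {
        case SPACE_DATA:  data_size  += space; break;
        case SPACE_META:  meta_size  += space; break;
        case SPACE_OTHER: other_size += space; break;
        }

        /* Only non-data allocations count against the metadata limit. */
        if (type != SPACE_DATA)
            meta_used += space;

        total_size += space;
    }

    static void
    space_return(uint64_t space, space_type_t type)
    {
        switch (type) {
        case SPACE_DATA:  data_size  -= space; break;
        case SPACE_META:  meta_size  -= space; break;
        case SPACE_OTHER: other_size -= space; break;
        }

        if (type != SPACE_DATA) {
            assert(meta_used >= space);
            meta_used -= space;
        }

        assert(total_size >= space);
        total_size -= space;
    }

The same pattern is what lets arc_buf_destroy() and arc_get_data_buf() later in the diff route metadata buffers through ARC_SPACE_META while data buffers use plain arc_space_consume()/arc_space_return() instead of open-coded ARCSTAT_INCR calls.
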
@@ -1601,12 +1617,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf, zio_buf_free);
- arc_space_return(size, ARC_SPACE_DATA);
+ arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf, zio_data_buf_free);
- ARCSTAT_INCR(arcstat_data_size, -size);
- atomic_add_64(&arc_size, -size);
+ arc_space_return(size, ARC_SPACE_DATA);
}
}
if (list_link_active(&buf->b_hdr->b_arc_node)) {
@@ -1887,6 +1902,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+top:
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
@@ -2002,6 +2018,15 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&state->arcs_mtx);
+ if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ (bytes < 0 || bytes_evicted < bytes)) {
+ /* Prevent second pass from recycling metadata into data */
+ recycle = FALSE;
+ type = ARC_BUFC_METADATA;
+ list = &state->arcs_list[type];
+ goto top;
+ }
+
if (bytes_evicted < bytes)
dprintf("only evicted %lld bytes from %x\n",
(longlong_t)bytes_evicted, state);
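
The new top: label turns arc_evict() into a two-pass walk: if the data list could not satisfy the request, it retries against the metadata list with recycling turned off, so a freed metadata buffer is never handed back as a data buffer. A rough, self-contained model of that control flow (evict_list() is a made-up stand-in for walking one list):

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    enum buf_type { BUFC_DATA, BUFC_METADATA };

    /* Stub for walking one eviction list; pretends it can free at most 512 bytes. */
    static int64_t
    evict_list(enum buf_type type, int64_t wanted, bool recycle)
    {
        (void) type;
        (void) recycle;
        return (wanted > 512 ? 512 : wanted);
    }

    /*
     * Try the data list first; fall back to the metadata list if that was not
     * enough (the real arc_evict() also loops again when asked to evict
     * everything, i.e. bytes < 0).
     */
    static int64_t
    evict_two_pass(int64_t wanted, bool recycle)
    {
        enum buf_type type = BUFC_DATA;
        int64_t evicted = 0;

    top:
        evicted += evict_list(type, wanted - evicted, recycle);

        if (type == BUFC_DATA && evicted < wanted) {
            /* The second pass must not recycle metadata into data buffers. */
            recycle = false;
            type = BUFC_METADATA;
            goto top;
        }
        return (evicted);
    }

    int
    main(void)
    {
        printf("evicted %lld bytes\n", (long long)evict_two_pass(1000, true));
        return (0);
    }

This fallback is also why the arc_adjust() hunks below can collapse their separate ARC_BUFC_DATA and ARC_BUFC_METADATA passes into a single eviction call per state.
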
@@ -2141,19 +2166,11 @@ arc_adjust(void)
*/
adjustment = MIN((int64_t)(arc_size - arc_c),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
- arc_p));
+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
+ if (adjustment > 0 && arc_mru->arcs_size > 0) {
+ delta = MIN(arc_mru->arcs_size, adjustment);
(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
-
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE,
- ARC_BUFC_METADATA);
}
/*
@@ -2162,17 +2179,9 @@ arc_adjust(void)
adjustment = arc_size - arc_c;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
+ if (adjustment > 0 && arc_mfu->arcs_size > 0) {
+ delta = MIN(arc_mfu->arcs_size, adjustment);
(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
-
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t delta = MIN(adjustment,
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE,
- ARC_BUFC_METADATA);
}
/*
@@ -2265,24 +2274,61 @@ arc_do_user_evicts(void)
 * This is only used to enforce the tunable arc_meta_limit; if we are
 * unable to evict enough buffers, notify the user via the prune callback.
*/
-void
-arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+static void
+arc_adjust_meta(void)
{
- int64_t delta;
+ int64_t adjustmnt, delta;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ /*
+ * This slightly differs from the way we evict from the mru in
+ * arc_adjust because we don't have a "target" value (i.e. no
+ * "meta" arc_p). As a result, I think we can completely
+ * cannibalize the metadata in the MRU before we evict the
+ * metadata from the MFU. I think we probably need to implement a
+ * "metadata arc_p" value to do this properly.
+ */
+ adjustmnt = arc_meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
- adjustment -= delta;
+ adjustmnt -= delta;
}
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ /*
+ * We can't afford to recalculate adjustmnt here. If we do,
+ * new metadata buffers can sneak into the MRU or ANON lists,
+ * thus penalizing the MFU metadata. Although the fudge factor is
+ * small, it has been empirically shown to be significant for
+ * certain workloads (e.g. creating many empty directories). As
+ * such, we use the original calculation for adjustmnt, and
+ * simply decrement the amount of data evicted from the MRU.
+ */
+
+ if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
- adjustment -= delta;
}
- if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+
+ if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(adjustmnt,
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+ arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+ }
+
+ adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+
+ if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(adjustmnt,
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
+ }
+
+ if (arc_meta_used > arc_meta_limit)
arc_do_user_prune(zfs_arc_meta_prune);
}
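
The rewritten arc_adjust_meta() computes the metadata overage once, spends it first against the MRU metadata list, and carries whatever is left into the MFU pass rather than recomputing it (as the comment above explains, recomputing would let freshly arrived MRU/ANON metadata penalize the MFU); the two ghost-list passes then trim against the same limit. A simplified model of that bookkeeping, with made-up sizes and a hypothetical eviction helper:

    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static uint64_t meta_used = 900, meta_limit = 512;    /* made-up sizes */
    static uint64_t mru_meta = 300, mfu_meta = 600;

    /* Hypothetical helper: shrink one metadata list by up to 'delta' bytes. */
    static void
    evict_meta(uint64_t *list_size, int64_t delta)
    {
        uint64_t freed = MIN((uint64_t)delta, *list_size);

        *list_size -= freed;
        meta_used -= freed;
    }

    static void
    adjust_meta(void)
    {
        int64_t adjustmnt, delta;

        /*
         * The overage is computed once and only decremented afterwards, so
         * metadata arriving in the MRU/ANON lists mid-pass cannot inflate
         * the amount taken from the MFU.
         */
        adjustmnt = (int64_t)(meta_used - meta_limit);

        if (adjustmnt > 0 && mru_meta > 0) {
            delta = MIN((int64_t)mru_meta, adjustmnt);
            evict_meta(&mru_meta, delta);
            adjustmnt -= delta;
        }

        if (adjustmnt > 0 && mfu_meta > 0) {
            delta = MIN((int64_t)mfu_meta, adjustmnt);
            evict_meta(&mfu_meta, delta);
        }
    }

With the sizes above, the MRU pass frees 300 bytes and the MFU pass the remaining 88, leaving meta_used exactly at meta_limit.
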
@@ -2341,7 +2387,13 @@ arc_shrink(uint64_t bytes)
else
arc_c = arc_c_min;
- atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
+ to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
+
+ if (arc_p > to_free)
+ atomic_add_64(&arc_p, -to_free);
+ else
+ arc_p = 0;
+
if (arc_c > arc_size)
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
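
In arc_shrink() the unconditional atomic decrement of arc_p is replaced by a clamped subtraction, so shrinking by an explicit byte count (or by 1/2^zfs_arc_shrink_shift of arc_p when no count is given) can no longer wrap the unsigned value below zero. The arithmetic in isolation (a sketch, not the kernel code):

    #include <stdint.h>

    #define SHRINK_SHIFT 5    /* matches the zfs_arc_shrink_shift default above */

    /*
     * Decide how much to take out of p: the caller's byte count if one was
     * given, otherwise 1/2^SHRINK_SHIFT of the current value, clamped so the
     * unsigned counter saturates at zero instead of wrapping.
     */
    static uint64_t
    shrink_p(uint64_t p, uint64_t bytes)
    {
        uint64_t to_free = bytes ? bytes : (p >> SHRINK_SHIFT);

        return (p > to_free ? p - to_free : 0);
    }
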
@@ -2396,7 +2448,6 @@ static void
arc_adapt_thread(void)
{
callb_cpr_t cpr;
- int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
@@ -2432,14 +2483,7 @@ arc_adapt_thread(void)
if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
arc_no_grow = FALSE;
- /*
- * Keep meta data usage within limits, arc_shrink() is not
- * used to avoid collapsing the arc_c value when only the
- * arc_meta_limit is being exceeded.
- */
- prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
- if (prune > 0)
- arc_adjust_meta(prune, B_TRUE);
+ arc_adjust_meta();
arc_adjust();
@@ -2574,8 +2618,10 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
*/
if (pages > 0) {
arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+ pages = btop(arc_evictable_memory());
} else {
arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+ pages = -1;
}
/*
@@ -2595,7 +2641,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
mutex_exit(&arc_reclaim_thr_lock);
- return (-1);
+ return (pages);
}
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
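
The shrinker hunk changes what the callback reports back: after an aggressive reap it returns the number of pages the ARC could still give up (btop(arc_evictable_memory())), while the conservative path keeps returning -1. A toy model of that return-value convention (the helpers here are stand-ins, not the SPL/Linux shrinker API):

    #include <stdio.h>

    static long cached_pages = 1000;    /* made-up amount of evictable cache */

    /* Stand-ins for arc_kmem_reap_now() and arc_evictable_memory(). */
    static void
    reap(long pages)
    {
        cached_pages -= (pages < cached_pages ? pages : cached_pages);
    }

    static long
    evictable(void)
    {
        return (cached_pages);
    }

    /*
     * An aggressive reap reports how many evictable pages remain, so the
     * caller can judge whether another pass is worthwhile; the conservative
     * path still reports -1.
     */
    static long
    shrink(long nr_to_scan, int aggressive)
    {
        reap(nr_to_scan);
        return (aggressive ? evictable() : -1);
    }

    int
    main(void)
    {
        printf("%ld\n", shrink(128, 1));    /* 872 pages still evictable */
        return (0);
    }
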
@@ -2611,7 +2657,6 @@ static void
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
- uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
if (state == arc_l2c_only)
return;
@@ -2628,18 +2673,22 @@ arc_adapt(int bytes, arc_state_t *state)
if (state == arc_mru_ghost) {
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
- mult = MIN(mult, 10);
+
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
- arc_p = MAX(arc_p_min, arc_p - delta);
+ arc_p = MAX(0, arc_p - delta);
}
ASSERT((int64_t)arc_p >= 0);
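
With zfs_arc_p_dampener_disable set (the new default), the ghost-hit multiplier is no longer capped at 10, and since arc_p_min is gone arc_p may now swing across the full [0, arc_c] range. A toy version of the MRU-ghost branch of arc_adapt() (names and the dampener flag are illustrative; it assumes the MRU ghost list is non-empty, which holds because we only get here on a hit in it):

    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int dampener_disable = 1;    /* mirrors zfs_arc_p_dampener_disable */

    /* Grow the MRU target after a hit in the MRU ghost list. */
    static uint64_t
    adapt_mru_ghost_hit(uint64_t arc_p, uint64_t arc_c, uint64_t bytes,
        uint64_t mru_ghost_size, uint64_t mfu_ghost_size)
    {
        uint64_t mult;

        mult = (mru_ghost_size >= mfu_ghost_size) ?
            1 : (mfu_ghost_size / mru_ghost_size);
        if (!dampener_disable)
            mult = MIN(mult, 10);    /* avoid wild arc_p adjustment */

        return (MIN(arc_c, arc_p + bytes * mult));
    }

The MFU-ghost branch is the mirror image: it shrinks arc_p by the same scaled amount and now bottoms out at 0 rather than at arc_p_min.
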
@@ -2710,6 +2759,8 @@ arc_get_data_buf(arc_buf_t *buf)
arc_state_t *state = buf->b_hdr->b_state;
uint64_t size = buf->b_hdr->b_size;
arc_buf_contents_t type = buf->b_hdr->b_type;
+ arc_buf_contents_t evict = ARC_BUFC_DATA;
+ boolean_t recycle = TRUE;
arc_adapt(size, state);
@@ -2720,12 +2771,11 @@ arc_get_data_buf(arc_buf_t *buf)
if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
+ arc_space_consume(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
- ARCSTAT_INCR(arcstat_data_size, size);
- atomic_add_64(&arc_size, size);
+ arc_space_consume(size, ARC_SPACE_DATA);
}
goto out;
}
@@ -2750,10 +2800,27 @@ arc_get_data_buf(arc_buf_t *buf)
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
+ /*
+ * Evict data buffers prior to metadata buffers, unless we're
+ * over the metadata limit and adding a metadata buffer.
+ */
+ if (type == ARC_BUFC_METADATA) {
+ if (arc_meta_used >= arc_meta_limit)
+ evict = ARC_BUFC_METADATA;
+ else
+ /*
+ * In this case, we're evicting data while
+ * adding metadata. Thus, to prevent recycling a
+ * data buffer into a metadata buffer, recycling
+ * is disabled in the following arc_evict call.
+ */
+ recycle = FALSE;
+ }
+
+ if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
+ arc_space_consume(size, ARC_SPACE_META);
/*
* If we are unable to recycle an existing meta buffer
@@ -2761,16 +2828,19 @@ arc_get_data_buf(arc_buf_t *buf)
* via the prune callback to drop references. The
 * prune callback is run in the context of the reclaim
* thread to avoid deadlocking on the hash_lock.
+ * Of course, only do this when recycle is true.
*/
- cv_signal(&arc_reclaim_thr_cv);
+ if (recycle)
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
- ARCSTAT_INCR(arcstat_data_size, size);
- atomic_add_64(&arc_size, size);
+ arc_space_consume(size, ARC_SPACE_DATA);
}
- ARCSTAT_BUMP(arcstat_recycle_miss);
+ /* Only bump this if we tried to recycle and failed */
+ if (recycle)
+ ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
out:
@@ -2790,7 +2860,8 @@ out:
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
- if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ if (!zfs_arc_p_aggressive_disable &&
+ arc_size < arc_c && hdr->b_state == arc_anon &&
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
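
Taken together, the arc_get_data_buf() hunks make a new metadata allocation prefer evicting data buffers unless metadata is already at or over its limit, and they disable recycling whenever the evicted type differs from the allocated type (signalling the reclaim thread and bumping arcstat_recycle_miss only when a recycle was actually attempted). The policy choice reduces to a few lines (a sketch with placeholder names):

    #include <stdbool.h>
    #include <stdint.h>

    enum buf_type { BUFC_DATA, BUFC_METADATA };

    struct evict_policy {
        enum buf_type evict;    /* which list to evict from */
        bool recycle;           /* may a freed buffer be reused directly? */
    };

    static struct evict_policy
    choose_eviction(enum buf_type alloc_type, uint64_t meta_used,
        uint64_t meta_limit)
    {
        struct evict_policy p = { BUFC_DATA, true };

        if (alloc_type == BUFC_METADATA) {
            if (meta_used >= meta_limit) {
                /* Already over the metadata limit: evict metadata. */
                p.evict = BUFC_METADATA;
            } else {
                /*
                 * Evicting data to make room for metadata; a freed
                 * data buffer must not be recycled into metadata.
                 */
                p.recycle = false;
            }
        }
        return (p);
    }
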
@@ -4025,8 +4096,8 @@ arc_init(void)
spl_register_shrinker(&arc_shrinker);
#endif
- /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
- arc_c_min = MAX(arc_c / 4, 64<<20);
+ /* set min cache to a small 4MB floor */
+ arc_c_min = 4<<20;
/* set max to 1/2 of all memory */
arc_c_max = arc_c * 4;
@@ -4036,23 +4107,20 @@ arc_init(void)
*/
if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
arc_c_max = zfs_arc_max;
- if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+ if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
arc_c_min = zfs_arc_min;
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
- /* limit meta-data to 1/4 of the arc capacity */
- arc_meta_limit = arc_c_max / 4;
+ /* limit meta-data to 3/4 of the arc capacity */
+ arc_meta_limit = (3 * arc_c_max) / 4;
arc_meta_max = 0;
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
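
Finally, arc_init() drops the old 64MB-or-1/32-of-memory floor on arc_c_min to a small fixed value and raises the default metadata cap from 1/4 to 3/4 of arc_c_max, with zfs_arc_min and zfs_arc_meta_limit still able to override either within bounds. The sizing arithmetic, roughly (a sketch, not the init code itself):

    #include <stdint.h>

    /* Default metadata limit: 3/4 of arc_c_max, unless a sane tunable overrides it. */
    static uint64_t
    default_meta_limit(uint64_t c_max, uint64_t zfs_arc_meta_limit)
    {
        uint64_t limit = (3 * c_max) / 4;

        /* Allow the tunable to override if it is reasonable. */
        if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= c_max)
            limit = zfs_arc_meta_limit;
        return (limit);
    }

On a machine where arc_c_max works out to 8GB, for example, the default metadata ceiling moves from 2GB to 6GB.
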
@@ -5548,12 +5616,15 @@ MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+module_param(zfs_arc_p_aggressive_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p growth");
+
+module_param(zfs_arc_p_dampener_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
+
module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
-module_param(zfs_arc_p_min_shift, int, 0644);
-MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
-
module_param(zfs_disable_dup_eviction, int, 0644);
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");