Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--  module/zfs/arc.c  188
1 file changed, 158 insertions, 30 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f71c24844..95d14a9e7 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
* protected from simultaneous callbacks from arc_buf_evict()
* and arc_do_user_evicts().
*
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
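As a rough sketch of the consumer side described above, a callback matching the arc_prune_func_t shape used by this patch might look like the following. This is hedged illustration only; zpl_prune_sb() and the use of the super block as the private argument are assumptions, not part of this change:

/*
 * Hypothetical ZPL-style prune callback.  The ARC reclaim thread
 * invokes it when arc_meta_limit is exceeded; it should release
 * roughly 'bytes' worth of references (e.g. dentries pinning
 * znodes) so the associated arc buffers become evictable.
 */
static void
zpl_prune_example(int64_t bytes, void *private)
{
	struct super_block *sb = private;

	/* zpl_prune_sb() is assumed here for illustration only */
	zpl_prune_sb(sb, bytes);
}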
@@ -120,14 +128,13 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
-#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -141,8 +148,8 @@ extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
@@ -180,7 +187,7 @@ unsigned long zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
/*
* Note that buffers can be in one of 6 states:
@@ -288,6 +295,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_no_grow;
kstat_named_t arcstat_tempreserve;
kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
@@ -352,6 +360,7 @@ static arc_stats_t arc_stats = {
{ "arc_no_grow", KSTAT_DATA_UINT64 },
{ "arc_tempreserve", KSTAT_DATA_UINT64 },
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
@@ -481,6 +490,8 @@ struct arc_buf_hdr {
list_node_t b_l2node;
};
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
@@ -1925,6 +1936,48 @@ arc_adjust(void)
}
}
+/*
+ * Request that the arc user drop references so that N bytes can be released
+ * from the cache. This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers (i.e. the ZPL).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+ arc_prune_func_t *func;
+ void *private;
+ arc_prune_t *cp, *np;
+
+ mutex_enter(&arc_prune_mtx);
+
+ cp = list_head(&arc_prune_list);
+ while (cp != NULL) {
+ func = cp->p_pfunc;
+ private = cp->p_private;
+ np = list_next(&arc_prune_list, cp);
+ refcount_add(&cp->p_refcnt, func);
+ mutex_exit(&arc_prune_mtx);
+
+ if (func != NULL)
+ func(adjustment, private);
+
+ mutex_enter(&arc_prune_mtx);
+
+ /* User removed prune callback concurrently with execution */
+ if (refcount_remove(&cp->p_refcnt, func) == 0) {
+ ASSERT(!list_link_active(&cp->p_node));
+ refcount_destroy(&cp->p_refcnt);
+ kmem_free(cp, sizeof (*cp));
+ }
+
+ cp = np;
+ }
+
+ ARCSTAT_BUMP(arcstat_prune);
+ mutex_exit(&arc_prune_mtx);
+}
+
static void
arc_do_user_evicts(void)
{
@@ -1949,6 +2002,32 @@ arc_do_user_evicts(void)
}
/*
+ * Evict only meta data objects from the cache, leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+ int64_t delta;
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ arc_do_user_prune(arc_meta_prune);
+}
+
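To make the two-pass logic above concrete: with an adjustment of 3 MB of excess metadata, 2 MB evictable on the MRU metadata list, and 4 MB on the MFU list, the first pass evicts the full 2 MB from the MRU, the second evicts the remaining 1 MB from the MFU, and the prune callbacks are never invoked. Only when both lists come up short while arc_meta_used still exceeds arc_meta_limit does arc_do_user_prune() ask consumers to drop references.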
+/*
* Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
@@ -2085,24 +2164,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-#ifdef _KERNEL
- int retry = 0;
-
- while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- retry++;
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
/*
* An aggressive reclamation will shrink the cache size as well as
@@ -2121,6 +2182,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
}
@@ -2131,6 +2193,7 @@ arc_reclaim_thread(void)
clock_t growtime = 0;
arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
+ int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
@@ -2160,9 +2223,14 @@ arc_reclaim_thread(void)
arc_no_grow = FALSE;
}
- /* Keep meta data usage within limits */
- if (arc_meta_used >= arc_meta_limit)
- arc_kmem_reap_now(ARC_RECLAIM_CONS);
+ /*
+ * Keep meta data usage within limits; arc_shrink() is not
+ * used to avoid collapsing the arc_c value when only the
+ * arc_meta_limit is being exceeded.
+ */
+ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+ if (prune > 0)
+ arc_adjust_meta(prune, B_TRUE);
arc_adjust();
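For example, with arc_meta_used at 600 MB against a 512 MB arc_meta_limit, prune works out to 88 MB and arc_adjust_meta() first tries to evict that much metadata outright; if eviction alone cannot get back under the limit, the registered callbacks are asked to release arc_meta_prune bytes (1 MB by default) worth of references on each reclaim pass.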
@@ -2399,16 +2467,27 @@ arc_get_data_buf(arc_buf_t *buf)
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
+
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
+
+ /*
+ * If we are unable to recycle an existing meta buffer,
+ * signal the reclaim thread. It will notify users
+ * via the prune callback to drop references. The
+ * prune callback is run in the context of the reclaim
+ * thread to avoid deadlocking on the hash_lock.
+ */
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
+
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -3021,6 +3100,37 @@ top:
return (0);
}
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
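Putting the registration pair to work, a consumer's lifecycle might look like the following hedged sketch; zpl_prune_example() is the hypothetical callback from the earlier sketch, and the mount/unmount hook names are assumptions for illustration:

/* hypothetical consumer-held handle */
static arc_prune_t *zpl_prune_handle;

static void
zpl_mount_example(void *sb)
{
	/* register at mount; 'sb' is handed back as 'private' */
	zpl_prune_handle = arc_add_prune_callback(zpl_prune_example, sb);
}

static void
zpl_unmount_example(void)
{
	/*
	 * Remove at unmount.  If the callback happens to be running,
	 * arc_do_user_prune() holds the last reference and frees the
	 * arc_prune_t once the callback returns.
	 */
	arc_remove_prune_callback(zpl_prune_handle);
}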
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
@@ -3598,8 +3708,8 @@ arc_init(void)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
- if (zfs_arc_reduce_dnlc_percent > 0)
- arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+ if (zfs_arc_meta_prune > 0)
+ arc_meta_prune = zfs_arc_meta_prune;
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
@@ -3646,7 +3756,10 @@ arc_init(void)
buf_init();
arc_thread_exit = 0;
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
arc_eviction_list = NULL;
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
@@ -3674,6 +3787,8 @@ arc_init(void)
void
arc_fini(void)
{
+ arc_prune_t *p;
+
mutex_enter(&arc_reclaim_thr_lock);
#ifdef _KERNEL
spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3808,17 @@ arc_fini(void)
arc_ksp = NULL;
}
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ refcount_remove(&p->p_refcnt, &arc_prune_list);
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
@@ -4774,6 +4900,8 @@ l2arc_stop(void)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
module_param(zfs_arc_min, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4912,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
module_param(zfs_arc_meta_limit, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
module_param(zfs_arc_grow_retry, int, 0444);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");