-rw-r--r--  include/os/linux/spl/sys/shrinker.h   |   4
-rw-r--r--  include/os/linux/zfs/sys/trace_arc.h  |  36
-rw-r--r--  include/sys/arc_impl.h                |  20
-rw-r--r--  man/man5/zfs-module-parameters.5      |  40
-rw-r--r--  module/os/freebsd/zfs/arc_os.c        |  19
-rw-r--r--  module/os/linux/zfs/arc_os.c          | 216
-rw-r--r--  module/zfs/arc.c                      | 252
7 files changed, 387 insertions(+), 200 deletions(-)
diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h
index e519a527c..cc34d8ab1 100644
--- a/include/os/linux/spl/sys/shrinker.h
+++ b/include/os/linux/spl/sys/shrinker.h
@@ -84,7 +84,7 @@ __ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\
\
static struct shrinker varname = { \
.shrink = __ ## varname ## _wrapper, \
- .seeks = seek_cost \
+ .seeks = seek_cost, \
}
#define SHRINK_STOP (-1)
@@ -97,7 +97,7 @@ static struct shrinker varname = { \
static struct shrinker varname = { \
.count_objects = countfunc, \
.scan_objects = scanfunc, \
- .seeks = seek_cost \
+ .seeks = seek_cost, \
}
#else
diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h
index 5ce5b38a3..faf2bd3d5 100644
--- a/include/os/linux/zfs/sys/trace_arc.h
+++ b/include/os/linux/zfs/sys/trace_arc.h
@@ -354,6 +354,41 @@ DEFINE_EVENT(zfs_l2arc_evict_class, name, \
/* END CSTYLED */
DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict);
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(...,
+ * uint64_t, ...,
+ * uint64_t, ...,
+ * uint64_t, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class,
+ TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count),
+ TP_ARGS(amount, arc_evict_count, aew_count),
+ TP_STRUCT__entry(
+ __field(uint64_t, amount)
+ __field(uint64_t, arc_evict_count)
+ __field(uint64_t, aew_count)
+ ),
+ TP_fast_assign(
+ __entry->amount = amount;
+ __entry->arc_evict_count = arc_evict_count;
+ __entry->aew_count = aew_count;
+ ),
+ TP_printk("amount %llu arc_evict_count %llu aew_count %llu",
+ __entry->amount, __entry->arc_evict_count, __entry->aew_count)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \
+DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \
+ TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \
+ TP_ARGS(amount, arc_evict_count, aew_count))
+/* END CSTYLED */
+DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction);
+
#endif /* _TRACE_ARC_H */
#undef TRACE_INCLUDE_PATH
@@ -376,6 +411,7 @@ DEFINE_DTRACE_PROBE1(l2arc__miss);
DEFINE_DTRACE_PROBE2(l2arc__read);
DEFINE_DTRACE_PROBE2(l2arc__write);
DEFINE_DTRACE_PROBE2(l2arc__iodone);
+DEFINE_DTRACE_PROBE3(arc__wait__for__eviction);
DEFINE_DTRACE_PROBE4(arc__miss);
DEFINE_DTRACE_PROBE4(l2arc__evict);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index eb90d5bc9..bb9163ba7 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -846,15 +846,11 @@ typedef struct arc_stats {
kstat_named_t arcstat_cached_only_in_progress;
} arc_stats_t;
-typedef enum free_memory_reason_t {
- FMR_UNKNOWN,
- FMR_NEEDFREE,
- FMR_LOTSFREE,
- FMR_SWAPFS_MINFREE,
- FMR_PAGES_PP_MAXIMUM,
- FMR_HEAP_ARENA,
- FMR_ZIO_ARENA,
-} free_memory_reason_t;
+typedef struct arc_evict_waiter {
+ list_node_t aew_node;
+ kcondvar_t aew_cv;
+ uint64_t aew_count;
+} arc_evict_waiter_t;
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -870,7 +866,6 @@ typedef enum free_memory_reason_t {
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
-#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
@@ -879,10 +874,6 @@ extern boolean_t arc_warm;
extern int arc_grow_retry;
extern int arc_no_grow_shift;
extern int arc_shrink_shift;
-extern zthr_t *arc_evict_zthr;
-extern kmutex_t arc_evict_lock;
-extern kcondvar_t arc_evict_waiters_cv;
-extern boolean_t arc_evict_needed;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
extern aggsum_t arc_size;
@@ -897,6 +888,7 @@ extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
+extern void arc_wait_for_eviction(uint64_t);
extern void arc_lowmem_init(void);
extern void arc_lowmem_fini(void);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index c2abd9d80..c209acbe1 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -864,6 +864,23 @@ Default value: \fB8192\fR.
.sp
.ne 2
.na
+\fBzfs_arc_eviction_pct\fR (int)
+.ad
+.RS 12n
+When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this
+percent of the requested amount of data to be evicted. For example, by
+default for every 2KB that's evicted, 1KB of it may be "reused" by a new
+allocation. Since this is above 100%, it ensures that progress is made
+towards getting \fBarc_size\fR under \fBarc_c\fR. Since this is finite, it
+ensures that allocations can still happen, even during the potentially long
+time that \fBarc_size\fR is more than \fBarc_c\fR.
+.sp
+Default value: \fB200\fR.
+.RE
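For reference, the snippet below is an illustrative userspace sketch (not part of this patch; eviction_wait_target() is a made-up helper name) of the arithmetic arc_get_data_impl() applies: with the default of 200, an 8 KB allocation waits for 16 KB of eviction.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only: mirrors the "size * zfs_arc_eviction_pct / 100"
 * computation performed in arc_get_data_impl(); the helper name is made up.
 */
static uint64_t
eviction_wait_target(uint64_t alloc_size, int eviction_pct)
{
	return (alloc_size * eviction_pct / 100);
}

int
main(void)
{
	/* With the default of 200, an 8 KB allocation waits for 16 KB. */
	printf("%llu bytes\n",
	    (unsigned long long)eviction_wait_target(8192, 200));
	return (0);
}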
+
+.sp
+.ne 2
+.na
\fBzfs_arc_evict_batch_limit\fR (int)
.ad
.RS 12n
@@ -1151,6 +1168,29 @@ Default value: \fB0\fR% (disabled).
.sp
.ne 2
.na
+\fBzfs_arc_shrinker_limit\fR (int)
+.ad
+.RS 12n
+This is a limit on how many pages the ARC shrinker makes available for
+eviction in response to one page allocation attempt. Note that in
+practice, the kernel's shrinker can ask us to evict up to about 4x this
+for one allocation attempt.
+.sp
+The default limit of 10,000 (in practice, 160MB per allocation attempt with
+4K pages) limits the amount of time spent attempting to reclaim ARC memory to
+less than 100ms per allocation attempt, even with a small average compressed
+block size of ~8KB.
+.sp
+The parameter can be set to 0 (zero) to disable the limit.
+.sp
+This parameter only applies on Linux.
+.sp
+Default value: \fB10,000\fR.
+.RE
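As a rough check of the numbers quoted above, here is an illustrative sketch (not part of this patch) of how the page limit translates into bytes, including the roughly 4x overshoot the kernel's shrinker may request before giving up.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Illustrative only: the figures match the man page text above. */
	const uint64_t page_size = 4096;	/* 4K pages assumed */
	const uint64_t limit_pages = 10000;	/* zfs_arc_shrinker_limit default */

	uint64_t per_call = limit_pages * page_size;	/* ~40 MB per attempt */
	uint64_t worst_case = 4 * per_call;		/* ~160 MB worst case */

	printf("per attempt: %llu MB, worst case: %llu MB\n",
	    (unsigned long long)(per_call / 1000000),
	    (unsigned long long)(worst_case / 1000000));
	return (0);
}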
+
+.sp
+.ne 2
+.na
\fBzfs_arc_sys_free\fR (ulong)
.ad
.RS 12n
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 554896d85..5f4b5df4a 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -52,9 +52,6 @@ extern struct vfsops zfs_vfsops;
uint_t zfs_arc_free_target = 0;
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
-
static void
arc_free_target_init(void *unused __unused)
{
@@ -100,7 +97,6 @@ arc_available_memory(void)
{
int64_t lowest = INT64_MAX;
int64_t n __unused;
- free_memory_reason_t r = FMR_UNKNOWN;
/*
* Cooperate with pagedaemon when it's time for it to scan
@@ -109,7 +105,6 @@ arc_available_memory(void)
n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
if (n < lowest) {
lowest = n;
- r = FMR_LOTSFREE;
}
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
/*
@@ -126,13 +121,10 @@ arc_available_memory(void)
n = uma_avail() - (long)(uma_limit() / 4);
if (n < lowest) {
lowest = n;
- r = FMR_HEAP_ARENA;
}
#endif
- last_free_memory = lowest;
- last_free_reason = r;
- DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
return (lowest);
}
@@ -223,18 +215,15 @@ arc_lowmem(void *arg __unused, int howto __unused)
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
arc_reduce_target_size(to_free);
- mutex_enter(&arc_evict_lock);
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
-
/*
* It is unsafe to block here in arbitrary threads, because we can come
* here from ARC itself and may hold ARC locks and thus risk a deadlock
* with ARC reclaim thread.
*/
if (curproc == pageproc)
- (void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock);
- mutex_exit(&arc_evict_lock);
+ arc_wait_for_eviction(to_free);
+ else
+ arc_wait_for_eviction(0);
}
void
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 9c3a6a4e2..92f9bae8c 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -57,8 +57,22 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt. Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
/*
* Return a default max arc size based on the amount of physical memory.
@@ -105,16 +119,6 @@ arc_free_memory(void)
}
/*
- * Additional reserve of pages for pp_reserve.
- */
-int64_t arc_pages_pp_reserve = 64;
-
-/*
- * Additional reserve of pages for swapfs.
- */
-int64_t arc_swapfs_reserve = 64;
-
-/*
* Return the amount of memory that can be consumed before reclaim will be
* needed. Positive if there is sufficient free memory, negative indicates
* the amount of memory that needs to be freed up.
@@ -122,25 +126,7 @@ int64_t arc_swapfs_reserve = 64;
int64_t
arc_available_memory(void)
{
- int64_t lowest = INT64_MAX;
- free_memory_reason_t r = FMR_UNKNOWN;
- int64_t n;
-
- if (arc_need_free > 0) {
- lowest = -arc_need_free;
- r = FMR_NEEDFREE;
- }
-
- n = arc_free_memory() - arc_sys_free - arc_need_free;
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
- last_free_memory = lowest;
- last_free_reason = r;
-
- return (lowest);
+ return (arc_free_memory() - arc_sys_free);
}
static uint64_t
@@ -174,84 +160,84 @@ arc_evictable_memory(void)
static unsigned long
arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return (btop((int64_t)arc_evictable_memory()));
+ /*
+ * __GFP_FS won't be set if we are called from ZFS code (see
+ * kmem_flags_convert(), which removes it). To avoid a deadlock, we
+ * don't allow evicting in this case. We return 0 rather than
+ * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+ * deficit against us.
+ */
+ if (!(sc->gfp_mask & __GFP_FS)) {
+ return (0);
+ }
+
+ /*
+ * This code is reached in the "direct reclaim" case, where the
+ * kernel (outside ZFS) is trying to allocate a page, and the system
+ * is low on memory.
+ *
+ * The kernel's shrinker code doesn't understand how many pages the
+ * ARC's callback actually frees, so it may ask the ARC to shrink a
+ * lot for one page allocation. This is problematic because it may
+ * take a long time, thus delaying the page allocation, and because
+ * it may force the ARC to unnecessarily shrink very small.
+ *
+ * Therefore, we limit the amount of data that we say is evictable,
+ * which limits the amount that the shrinker will ask us to evict for
+ * one page allocation attempt.
+ *
+ * In practice, we may be asked to shrink 4x the limit to satisfy one
+ * page allocation, before the kernel's shrinker code gives up on us.
+ * When that happens, we rely on the kernel code to find the pages
+ * that we freed before invoking the OOM killer. This happens in
+ * __alloc_pages_slowpath(), which retries and finds the pages we
+ * freed when it calls get_page_from_freelist().
+ *
+ * See also the comment above zfs_arc_shrinker_limit.
+ */
+ int64_t limit = zfs_arc_shrinker_limit != 0 ?
+ zfs_arc_shrinker_limit : INT64_MAX;
+ return (MIN(limit, btop((int64_t)arc_evictable_memory())));
}
static unsigned long
arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int64_t pages;
+ ASSERT((sc->gfp_mask & __GFP_FS) != 0);
/* The arc is considered warm once reclaim has occurred */
if (unlikely(arc_warm == B_FALSE))
arc_warm = B_TRUE;
- /* Return the potential number of reclaimable pages */
- pages = btop((int64_t)arc_evictable_memory());
-
- /* Not allowed to perform filesystem reclaim */
- if (!(sc->gfp_mask & __GFP_FS))
- return (SHRINK_STOP);
-
- /* Reclaim in progress */
- if (mutex_tryenter(&arc_evict_lock) == 0) {
- ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
- return (0);
- }
-
- mutex_exit(&arc_evict_lock);
+ /*
+ * Evict the requested number of pages by reducing arc_c and waiting
+ * for the requested amount of data to be evicted.
+ */
+ arc_reduce_target_size(ptob(sc->nr_to_scan));
+ arc_wait_for_eviction(ptob(sc->nr_to_scan));
+ if (current->reclaim_state != NULL)
+ current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
/*
- * Evict the requested number of pages by shrinking arc_c the
- * requested amount.
+ * We are experiencing memory pressure which the arc_evict_zthr was
+ * unable to keep up with. Set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
*/
- if (pages > 0) {
- arc_reduce_target_size(ptob(sc->nr_to_scan));
-
- /*
- * Repeated calls to the arc shrinker can reduce arc_c
- * drastically, potentially all the way to arc_c_min. While
- * arc_c is below arc_size, ZFS can't process read/write
- * requests, because arc_get_data_impl() will block. To
- * ensure that arc_c doesn't shrink faster than the evict
- * thread can keep up, we wait for eviction here.
- */
- mutex_enter(&arc_evict_lock);
- if (arc_is_overflowing()) {
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
- (void) cv_wait(&arc_evict_waiters_cv,
- &arc_evict_lock);
- }
- mutex_exit(&arc_evict_lock);
-
- if (current_is_kswapd())
- arc_kmem_reap_soon();
- pages = MAX((int64_t)pages -
- (int64_t)btop(arc_evictable_memory()), 0);
- /*
- * We've shrunk what we can, wake up threads.
- */
- cv_broadcast(&arc_evict_waiters_cv);
- } else
- pages = SHRINK_STOP;
+ arc_no_grow = B_TRUE;
/*
* When direct reclaim is observed it usually indicates a rapid
* increase in memory pressure. This occurs because the kswapd
* threads were unable to asynchronously keep enough free memory
- * available. In this case set arc_no_grow to briefly pause arc
- * growth to avoid compounding the memory pressure.
+ * available.
*/
if (current_is_kswapd()) {
ARCSTAT_BUMP(arcstat_memory_indirect_count);
} else {
- arc_no_grow = B_TRUE;
- arc_kmem_reap_soon();
ARCSTAT_BUMP(arcstat_memory_direct_count);
}
- return (pages);
+ return (sc->nr_to_scan);
}
SPL_SHRINKER_DECLARE(arc_shrinker,
@@ -305,9 +291,56 @@ arc_lowmem_init(void)
*/
spl_register_shrinker(&arc_shrinker);
- /* Set to 1/64 of all memory or a minimum of 512K */
- arc_sys_free = MAX(allmem / 64, (512 * 1024));
- arc_need_free = 0;
+ /*
+ * The ARC tries to keep at least this much memory available for the
+ * system. This gives the ARC time to shrink in response to memory
+ * pressure, before running completely out of memory and invoking the
+ * direct-reclaim ARC shrinker.
+ *
+ * This should be more than twice high_wmark_pages(), so that
+ * arc_wait_for_eviction() will wait until at least the
+ * high_wmark_pages() are free (see arc_evict_state_impl()).
+ *
+ * Note: Even when the system is very low on memory, the kernel's
+ * shrinker code may only ask for one "batch" of pages (512KB) to be
+ * evicted. If concurrent allocations consume these pages, there may
+ * still be insufficient free pages, and the OOM killer takes action.
+ *
+ * By setting arc_sys_free large enough, and having
+ * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+ * free memory, it is much less likely that concurrent allocations can
+ * consume all the memory that was evicted before checking for
+ * OOM.
+ *
+ * It's hard to iterate the zones from a linux kernel module, which
+ * makes it difficult to determine the watermark dynamically. Instead
+ * we compute the maximum high watermark for this system, based
+ * on the amount of memory, assuming default parameters on Linux kernel
+ * 5.3.
+ */
+
+ /*
+ * Base wmark_low is 4 * the square root of Kbytes of RAM.
+ */
+ long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+ /*
+ * Clamp to between 128K and 64MB.
+ */
+ wmark = MAX(wmark, 128 * 1024);
+ wmark = MIN(wmark, 64 * 1024 * 1024);
+
+ /*
+ * watermark_boost can increase the wmark by up to 150%.
+ */
+ wmark += wmark * 150 / 100;
+
+ /*
+ * arc_sys_free needs to be more than 2x the watermark, because
+ * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up
+ * to 3x to ensure we're above it.
+ */
+ arc_sys_free = wmark * 3 + allmem / 32;
}
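To see what the formula above yields in practice, the following self-contained sketch (illustrative only; isqrt() is a stand-in for the kernel's int_sqrt(), and the 16 GiB figure is hypothetical) works the computation through, arriving at an arc_sys_free of roughly 632 MiB.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Integer square root, standing in for the kernel's int_sqrt(). */
static uint64_t
isqrt(uint64_t x)
{
	uint64_t r = 0;
	while ((r + 1) * (r + 1) <= x)
		r++;
	return (r);
}

int
main(void)
{
	uint64_t allmem = 16ULL << 30;		/* assume 16 GiB of RAM */

	/* Base wmark_low: 4 * sqrt(KiB of RAM), converted to bytes. */
	int64_t wmark = 4 * isqrt(allmem / 1024) * 1024;	/* 16 MiB */

	/* Clamp to [128 KiB, 64 MiB]. */
	if (wmark < 128 * 1024)
		wmark = 128 * 1024;
	if (wmark > 64 * 1024 * 1024)
		wmark = 64 * 1024 * 1024;

	/* watermark_boost can add up to 150%. */
	wmark += wmark * 150 / 100;			/* 40 MiB */

	/* 3x the boosted watermark, plus 1/32 of RAM. */
	int64_t arc_sys_free = wmark * 3 + allmem / 32;	/* ~632 MiB */

	printf("arc_sys_free = %" PRId64 " MiB\n", arc_sys_free >> 20);
	return (0);
}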
void
@@ -348,15 +381,11 @@ int64_t
arc_available_memory(void)
{
int64_t lowest = INT64_MAX;
- free_memory_reason_t r = FMR_UNKNOWN;
/* Every 100 calls, free a small amount */
if (spa_get_random(100) == 0)
lowest = -1024;
- last_free_memory = lowest;
- last_free_reason = r;
-
return (lowest);
}
@@ -429,3 +458,8 @@ arc_prune_async(int64_t adjust)
}
mutex_exit(&arc_prune_mtx);
}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+ "Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index dcf710ad1..3ec98917d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -313,17 +313,38 @@ boolean_t arc_watch = B_FALSE;
* calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
* arc_available_memory().
*/
-static zthr_t *arc_reap_zthr;
+static zthr_t *arc_reap_zthr;
/*
* This thread's job is to keep arc_size under arc_c, by calling
* arc_evict(), which improves arc_is_overflowing().
*/
-zthr_t *arc_evict_zthr;
+static zthr_t *arc_evict_zthr;
-kmutex_t arc_evict_lock;
-kcondvar_t arc_evict_waiters_cv;
-boolean_t arc_evict_needed = B_FALSE;
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted. For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c. Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
/*
* The number of headers to evict in arc_evict_state_impl() before
@@ -632,6 +653,7 @@ arc_state_t *arc_mfu;
#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit)
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
/* size of all b_rabd's in entire arc */
#define arc_raw_size ARCSTAT(arcstat_raw_size)
@@ -3859,6 +3881,20 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
return (bytes_evicted);
}
+static void
+arc_set_need_free(void)
+{
+ ASSERT(MUTEX_HELD(&arc_evict_lock));
+ int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+ arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+ if (aw == NULL) {
+ arc_need_free = MAX(-remaining, 0);
+ } else {
+ arc_need_free =
+ MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+ }
+}
+
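A quick numeric illustration of the two branches above (the figures and the program are hypothetical, not from the patch): with arc_sys_free at 600 MiB and only 200 MiB free, remaining is -100 MiB, so arc_need_free becomes 100 MiB when there are no waiters, or the waiter's outstanding 150 MiB when that is larger.

#include <stdio.h>
#include <stdint.h>

#define	MIB(x)	((int64_t)(x) << 20)
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	/* Hypothetical figures, for illustration only. */
	int64_t free_memory = MIB(200);
	int64_t sys_free = MIB(600);
	int64_t remaining = free_memory - sys_free / 2;	/* -100 MiB */

	/* No waiters: need enough eviction to get back above sys_free/2. */
	printf("no waiters: %lld MiB\n",
	    (long long)(MAX(-remaining, 0) >> 20));

	/* One waiter still owed 150 MiB of eviction: take the larger demand. */
	int64_t waiter_deficit = MIB(150);
	printf("with waiter: %lld MiB\n",
	    (long long)(MAX(-remaining, waiter_deficit) >> 20));
	return (0);
}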
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes)
@@ -3938,29 +3974,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
if (evicted != 0)
evict_count++;
- /*
- * If arc_size isn't overflowing, signal any
- * threads that might happen to be waiting.
- *
- * For each header evicted, we wake up a single
- * thread. If we used cv_broadcast, we could
- * wake up "too many" threads causing arc_size
- * to significantly overflow arc_c; since
- * arc_get_data_impl() doesn't check for overflow
- * when it's woken up (it doesn't because it's
- * possible for the ARC to be overflowing while
- * full of un-evictable buffers, and the
- * function should proceed in this case).
- *
- * If threads are left sleeping, due to not
- * using cv_broadcast here, they will be woken
- * up via cv_broadcast in arc_evict_cb() just
- * before arc_evict_zthr sleeps.
- */
- mutex_enter(&arc_evict_lock);
- if (!arc_is_overflowing())
- cv_signal(&arc_evict_waiters_cv);
- mutex_exit(&arc_evict_lock);
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
}
@@ -3969,6 +3982,32 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
multilist_sublist_unlock(mls);
/*
+ * Increment the count of evicted bytes, and wake up any threads that
+ * are waiting for the count to reach this value. Since the list is
+ * ordered by ascending aew_count, we pop off the beginning of the
+ * list until we reach the end, or a waiter that's past the current
+ * "count". Doing this outside the loop reduces the number of times
+ * we need to acquire the global arc_evict_lock.
+ *
+ * Only wake when there's sufficient free memory in the system
+ * (specifically, arc_sys_free/2, which by default is a bit more than
+ * 1/64th of RAM). See the comments in arc_wait_for_eviction().
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_count += bytes_evicted;
+
+ if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) {
+ arc_evict_waiter_t *aw;
+ while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+ aw->aew_count <= arc_evict_count) {
+ list_remove(&arc_evict_waiters, aw);
+ cv_broadcast(&aw->aew_cv);
+ }
+ }
+ arc_set_need_free();
+ mutex_exit(&arc_evict_lock);
+
+ /*
* If the ARC size is reduced from arc_c_max to arc_c_min (especially
* if the average cached block is small), eviction can be on-CPU for
* many seconds. To ensure that other threads that may be bound to
@@ -4582,7 +4621,16 @@ void
arc_reduce_target_size(int64_t to_free)
{
uint64_t asize = aggsum_value(&arc_size);
- uint64_t c = arc_c;
+
+ /*
+ * All callers want the ARC to actually evict (at least) this much
+ * memory. Therefore we reduce from the lower of the current size and
+ * the target size. This way, even if arc_c is much higher than
+ * arc_size (as can be the case after many calls to arc_freed()), we will
+ * immediately have arc_c < arc_size and therefore the arc_evict_zthr
+ * will evict.
+ */
+ uint64_t c = MIN(arc_c, asize);
if (c > to_free && c - to_free > arc_c_min) {
arc_c = c - to_free;
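A small illustrative sketch (userspace, with made-up numbers) of why the reduction starts from MIN(arc_c, arc_size): if arc_c had drifted to 10 GiB while arc_size is only 2 GiB, subtracting 1 GiB from arc_c alone would leave it far above arc_size and trigger no eviction, whereas reducing from the minimum immediately puts arc_c below arc_size.

#include <stdio.h>
#include <stdint.h>

#define	GIB(x)	((uint64_t)(x) << 30)
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Hypothetical numbers: arc_c has drifted well above the actual size. */
	uint64_t arc_c = GIB(10), asize = GIB(2), to_free = GIB(1);
	uint64_t arc_c_min = GIB(1) / 2;

	uint64_t c = MIN(arc_c, asize);		/* 2 GiB, not 10 GiB */
	if (c > to_free && c - to_free > arc_c_min)
		arc_c = c - to_free;		/* 1 GiB, now below arc_size */

	printf("new arc_c = %llu GiB, arc_size = %llu GiB\n",
	    (unsigned long long)(arc_c >> 30),
	    (unsigned long long)(asize >> 30));
	return (0);
}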
@@ -4693,18 +4741,18 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
/*
- * We have to rely on arc_get_data_impl() to tell us when to evict,
- * rather than checking if we are overflowing here, so that we are
- * sure to not leave arc_get_data_impl() waiting on
- * arc_evict_waiters_cv. If we have become "not overflowing" since
- * arc_get_data_impl() checked, we need to wake it up. We could
- * broadcast the CV here, but arc_get_data_impl() may have not yet
- * gone to sleep. We would need to use a mutex to ensure that this
- * function doesn't broadcast until arc_get_data_impl() has gone to
- * sleep (e.g. the arc_evict_lock). However, the lock ordering of
- * such a lock would necessarily be incorrect with respect to the
- * zthr_lock, which is held before this function is called, and is
- * held by arc_get_data_impl() when it calls zthr_wakeup().
+ * We have to rely on arc_wait_for_eviction() to tell us when to
+ * evict, rather than checking if we are overflowing here, so that we
+ * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+ * If we have become "not overflowing" since arc_wait_for_eviction()
+ * checked, we need to wake it up. We could broadcast the CV here,
+ * but arc_wait_for_eviction() may have not yet gone to sleep. We
+ * would need to use a mutex to ensure that this function doesn't
+ * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+ * the arc_evict_lock). However, the lock ordering of such a lock
+ * would necessarily be incorrect with respect to the zthr_lock,
+ * which is held before this function is called, and is held by
+ * arc_wait_for_eviction() when it calls zthr_wakeup().
*/
return (arc_evict_needed);
}
@@ -4743,8 +4791,11 @@ arc_evict_cb(void *arg, zthr_t *zthr)
* can't evict anything more, so we should wake
* arc_get_data_impl() sooner.
*/
- cv_broadcast(&arc_evict_waiters_cv);
- arc_need_free = 0;
+ arc_evict_waiter_t *aw;
+ while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+ cv_broadcast(&aw->aew_cv);
+ }
+ arc_set_need_free();
}
mutex_exit(&arc_evict_lock);
spl_fstrans_unmark(cookie);
@@ -4824,9 +4875,6 @@ arc_reap_cb(void *arg, zthr_t *zthr)
int64_t to_free =
(arc_c >> arc_shrink_shift) - free_memory;
if (to_free > 0) {
-#ifdef _KERNEL
- to_free = MAX(to_free, arc_need_free);
-#endif
arc_reduce_target_size(to_free);
}
spl_fstrans_unmark(cookie);
@@ -5008,6 +5056,64 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
}
/*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system. Waiting for
+ * eviction ensures that the memory used by the ARC decreases. Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings. See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+ mutex_enter(&arc_evict_lock);
+ if (arc_is_overflowing()) {
+ arc_evict_needed = B_TRUE;
+ zthr_wakeup(arc_evict_zthr);
+
+ if (amount != 0) {
+ arc_evict_waiter_t aw;
+ list_link_init(&aw.aew_node);
+ cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+ arc_evict_waiter_t *last =
+ list_tail(&arc_evict_waiters);
+ if (last != NULL) {
+ ASSERT3U(last->aew_count, >, arc_evict_count);
+ aw.aew_count = last->aew_count + amount;
+ } else {
+ aw.aew_count = arc_evict_count + amount;
+ }
+
+ list_insert_tail(&arc_evict_waiters, &aw);
+
+ arc_set_need_free();
+
+ DTRACE_PROBE3(arc__wait__for__eviction,
+ uint64_t, amount,
+ uint64_t, arc_evict_count,
+ uint64_t, aw.aew_count);
+
+ /*
+ * We will be woken up either when arc_evict_count
+ * reaches aew_count, or when the ARC is no longer
+ * overflowing and eviction completes.
+ */
+ cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+ /*
+ * In case of "false" wakeup, we will still be on the
+ * list.
+ */
+ if (list_link_active(&aw.aew_node))
+ list_remove(&arc_evict_waiters, &aw);
+
+ cv_destroy(&aw.aew_cv);
+ }
+ }
+ mutex_exit(&arc_evict_lock);
+}
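The waiter targets assigned above are cumulative: a new waiter queues behind the previous waiter's target rather than behind the current count, so earlier waiters always complete first. A minimal sketch of that bookkeeping (illustrative only; waiter_target() is a made-up helper, and locking and condition variables are omitted):

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only: returns the aew_count a new waiter would be assigned,
 * given the current evict count and the target of the last queued waiter
 * (0 if the list is empty).  waiter_target() is a made-up helper name.
 */
static uint64_t
waiter_target(uint64_t evict_count, uint64_t last_target, uint64_t amount)
{
	return ((last_target != 0 ? last_target : evict_count) + amount);
}

int
main(void)
{
	uint64_t evict_count = 100;	/* bytes evicted so far (hypothetical) */

	/* First waiter wants 50 bytes evicted: its target is 150. */
	uint64_t a = waiter_target(evict_count, 0, 50);
	/* Second waiter wants 30 more: it queues behind A, target 180. */
	uint64_t b = waiter_target(evict_count, a, 30);

	printf("A waits for count %llu, B waits for count %llu\n",
	    (unsigned long long)a, (unsigned long long)b);
	return (0);
}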
+
+/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
@@ -5022,40 +5128,26 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
arc_adapt(size, state);
/*
- * If arc_size is currently overflowing, and has grown past our
- * upper limit, we must be adding data faster than the evict
- * thread can evict. Thus, to ensure we don't compound the
+ * If arc_size is currently overflowing, we must be adding data
+ * faster than we are evicting. To ensure we don't compound the
* problem by adding more data and forcing arc_size to grow even
- * further past it's target size, we halt and wait for the
- * eviction thread to catch up.
+ * further past its target size, we wait for the eviction thread to
+ * make some progress. We also wait for there to be sufficient free
+ * memory in the system, as measured by arc_free_memory().
+ *
+ * Specifically, we wait for zfs_arc_eviction_pct percent of the
+ * requested size to be evicted. This should be more than 100%, to
+ * ensure that progress is also made towards getting arc_size
+ * under arc_c. See the comment above zfs_arc_eviction_pct.
*
- * It's also possible that the reclaim thread is unable to evict
- * enough buffers to get arc_size below the overflow limit (e.g.
- * due to buffers being un-evictable, or hash lock collisions).
- * In this case, we want to proceed regardless if we're
- * overflowing; thus we don't use a while loop here.
+ * We do the overflowing check without holding the arc_evict_lock to
+ * reduce lock contention in this hot path. Note that
+ * arc_wait_for_eviction() will acquire the lock and check again to
+ * ensure we are truly overflowing before blocking.
*/
if (arc_is_overflowing()) {
- mutex_enter(&arc_evict_lock);
-
- /*
- * Now that we've acquired the lock, we may no longer be
- * over the overflow limit, lets check.
- *
- * We're ignoring the case of spurious wake ups. If that
- * were to happen, it'd let this thread consume an ARC
- * buffer before it should have (i.e. before we're under
- * the overflow limit and were signalled by the reclaim
- * thread). As long as that is a rare occurrence, it
- * shouldn't cause any harm.
- */
- if (arc_is_overflowing()) {
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
- (void) cv_wait(&arc_evict_waiters_cv,
- &arc_evict_lock);
- }
- mutex_exit(&arc_evict_lock);
+ arc_wait_for_eviction(size *
+ zfs_arc_eviction_pct / 100);
}
VERIFY3U(hdr->b_type, ==, type);
@@ -7269,7 +7361,8 @@ arc_init(void)
{
uint64_t percent, allmem = arc_all_memory();
mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL);
+ list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+ offsetof(arc_evict_waiter_t, aew_node));
arc_min_prefetch_ms = 1000;
arc_min_prescient_prefetch_ms = 6000;
@@ -7402,7 +7495,7 @@ arc_fini(void)
(void) zthr_cancel(arc_reap_zthr);
mutex_destroy(&arc_evict_lock);
- cv_destroy(&arc_evict_waiters_cv);
+ list_destroy(&arc_evict_waiters);
/*
* buf_fini() must precede arc_state_fini() because buf_fini() may
@@ -10357,4 +10450,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
"Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+ "When full, ARC allocation waits for eviction of this % of alloc size");
/* END CSTYLED */