Diffstat (limited to 'module/os')
-rw-r--r--  module/os/freebsd/zfs/arc_os.c |  19
-rw-r--r--  module/os/linux/zfs/arc_os.c   | 216
2 files changed, 129 insertions(+), 106 deletions(-)
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 554896d85..5f4b5df4a 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -52,9 +52,6 @@ extern struct vfsops zfs_vfsops;
uint_t zfs_arc_free_target = 0;
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
-
static void
arc_free_target_init(void *unused __unused)
{
@@ -100,7 +97,6 @@ arc_available_memory(void)
{
int64_t lowest = INT64_MAX;
int64_t n __unused;
- free_memory_reason_t r = FMR_UNKNOWN;
/*
* Cooperate with pagedaemon when it's time for it to scan
@@ -109,7 +105,6 @@ arc_available_memory(void)
n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
if (n < lowest) {
lowest = n;
- r = FMR_LOTSFREE;
}
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
/*
@@ -126,13 +121,10 @@ arc_available_memory(void)
n = uma_avail() - (long)(uma_limit() / 4);
if (n < lowest) {
lowest = n;
- r = FMR_HEAP_ARENA;
}
#endif
- last_free_memory = lowest;
- last_free_reason = r;
- DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
return (lowest);
}
@@ -223,18 +215,15 @@ arc_lowmem(void *arg __unused, int howto __unused)
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
arc_reduce_target_size(to_free);
- mutex_enter(&arc_evict_lock);
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
-
/*
* It is unsafe to block here in arbitrary threads, because we can come
* here from ARC itself and may hold ARC locks and thus risk a deadlock
* with the ARC reclaim thread.
*/
if (curproc == pageproc)
- (void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock);
- mutex_exit(&arc_evict_lock);
+ arc_wait_for_eviction(to_free);
+ else
+ arc_wait_for_eviction(0);
}
void
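Both call sites above rely on arc_wait_for_eviction(), which this commit adds in arc.c (outside this diffstat-limited view): a nonzero amount blocks until that much data has been evicted, while 0 only nudges the evict thread, so arbitrary, possibly lock-holding threads can call it safely. A minimal sketch of those assumed semantics, reusing the lock, flag, and condvar visible in the removed lines (the real implementation uses a per-waiter list rather than a single condvar):

/*
 * Sketch only, not the actual arc.c implementation: wake the evict
 * thread, and if `amount' is nonzero, block until eviction progresses.
 */
void
arc_wait_for_eviction(uint64_t amount)
{
	mutex_enter(&arc_evict_lock);
	if (arc_is_overflowing()) {
		arc_evict_needed = B_TRUE;
		zthr_wakeup(arc_evict_zthr);
		if (amount != 0)
			(void) cv_wait(&arc_evict_waiters_cv,
			    &arc_evict_lock);
	}
	mutex_exit(&arc_evict_lock);
}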
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 9c3a6a4e2..92f9bae8c 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -57,8 +57,22 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt. Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable the limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
/*
* Return a default max arc size based on the amount of physical memory.
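A quick standalone check (hypothetical snippet, assuming 4 KiB pages) of the "160MB per allocation attempt" figure quoted in the new comment:

#include <stdio.h>

int
main(void)
{
	long limit = 10000;		/* zfs_arc_shrinker_limit (pages) */
	long per_call = limit * 4096;	/* one shrinker call: ~41 MB */
	long per_alloc = 4 * per_call;	/* kernel's ~4x over-ask: ~164 MB */

	printf("%ld MB per call, %ld MB per allocation attempt\n",
	    per_call / 1000000, per_alloc / 1000000);
	return (0);
}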
@@ -105,16 +119,6 @@ arc_free_memory(void)
}
/*
- * Additional reserve of pages for pp_reserve.
- */
-int64_t arc_pages_pp_reserve = 64;
-
-/*
- * Additional reserve of pages for swapfs.
- */
-int64_t arc_swapfs_reserve = 64;
-
-/*
* Return the amount of memory that can be consumed before reclaim will be
* needed. Positive if there is sufficient free memory, negative indicates
* the amount of memory that needs to be freed up.
@@ -122,25 +126,7 @@ int64_t arc_swapfs_reserve = 64;
int64_t
arc_available_memory(void)
{
- int64_t lowest = INT64_MAX;
- free_memory_reason_t r = FMR_UNKNOWN;
- int64_t n;
-
- if (arc_need_free > 0) {
- lowest = -arc_need_free;
- r = FMR_NEEDFREE;
- }
-
- n = arc_free_memory() - arc_sys_free - arc_need_free;
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
- last_free_memory = lowest;
- last_free_reason = r;
-
- return (lowest);
+ return (arc_free_memory() - arc_sys_free);
}
static uint64_t
@@ -174,84 +160,84 @@ arc_evictable_memory(void)
static unsigned long
arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return (btop((int64_t)arc_evictable_memory()));
+ /*
+ * __GFP_FS won't be set if we are called from ZFS code (see
+ * kmem_flags_convert(), which removes it). To avoid a deadlock, we
+ * don't allow evicting in this case. We return 0 rather than
+ * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+ * deficit against us.
+ */
+ if (!(sc->gfp_mask & __GFP_FS)) {
+ return (0);
+ }
+
+ /*
+ * This code is reached in the "direct reclaim" case, where the
+ * kernel (outside ZFS) is trying to allocate a page, and the system
+ * is low on memory.
+ *
+ * The kernel's shrinker code doesn't understand how many pages the
+ * ARC's callback actually frees, so it may ask the ARC to shrink a
+ * lot for one page allocation. This is problematic because it may
+ * take a long time, thus delaying the page allocation, and because
+ * it may force the ARC to unnecessarily shrink very small.
+ *
+ * Therefore, we limit the amount of data that we say is evictable,
+ * which limits the amount that the shrinker will ask us to evict for
+ * one page allocation attempt.
+ *
+ * In practice, we may be asked to shrink 4x the limit to satisfy one
+ * page allocation, before the kernel's shrinker code gives up on us.
+ * When that happens, we rely on the kernel code to find the pages
+ * that we freed before invoking the OOM killer. This happens in
+ * __alloc_pages_slowpath(), which retries and finds the pages we
+ * freed when it calls get_page_from_freelist().
+ *
+ * See also the comment above zfs_arc_shrinker_limit.
+ */
+ int64_t limit = zfs_arc_shrinker_limit != 0 ?
+ zfs_arc_shrinker_limit : INT64_MAX;
+ return (MIN(limit, btop((int64_t)arc_evictable_memory())));
}
static unsigned long
arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int64_t pages;
+ ASSERT((sc->gfp_mask & __GFP_FS) != 0);
/* The arc is considered warm once reclaim has occurred */
if (unlikely(arc_warm == B_FALSE))
arc_warm = B_TRUE;
- /* Return the potential number of reclaimable pages */
- pages = btop((int64_t)arc_evictable_memory());
-
- /* Not allowed to perform filesystem reclaim */
- if (!(sc->gfp_mask & __GFP_FS))
- return (SHRINK_STOP);
-
- /* Reclaim in progress */
- if (mutex_tryenter(&arc_evict_lock) == 0) {
- ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
- return (0);
- }
-
- mutex_exit(&arc_evict_lock);
+ /*
+ * Evict the requested number of pages by reducing arc_c and waiting
+ * for the requested amount of data to be evicted.
+ */
+ arc_reduce_target_size(ptob(sc->nr_to_scan));
+ arc_wait_for_eviction(ptob(sc->nr_to_scan));
+ if (current->reclaim_state != NULL)
+ current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
/*
- * Evict the requested number of pages by shrinking arc_c the
- * requested amount.
+ * We are experiencing memory pressure which the arc_evict_zthr was
+ * unable to keep up with. Set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
*/
- if (pages > 0) {
- arc_reduce_target_size(ptob(sc->nr_to_scan));
-
- /*
- * Repeated calls to the arc shrinker can reduce arc_c
- * drastically, potentially all the way to arc_c_min. While
- * arc_c is below arc_size, ZFS can't process read/write
- * requests, because arc_get_data_impl() will block. To
- * ensure that arc_c doesn't shrink faster than the evict
- * thread can keep up, we wait for eviction here.
- */
- mutex_enter(&arc_evict_lock);
- if (arc_is_overflowing()) {
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
- (void) cv_wait(&arc_evict_waiters_cv,
- &arc_evict_lock);
- }
- mutex_exit(&arc_evict_lock);
-
- if (current_is_kswapd())
- arc_kmem_reap_soon();
- pages = MAX((int64_t)pages -
- (int64_t)btop(arc_evictable_memory()), 0);
- /*
- * We've shrunk what we can, wake up threads.
- */
- cv_broadcast(&arc_evict_waiters_cv);
- } else
- pages = SHRINK_STOP;
+ arc_no_grow = B_TRUE;
/*
* When direct reclaim is observed it usually indicates a rapid
* increase in memory pressure. This occurs because the kswapd
* threads were unable to asynchronously keep enough free memory
- * available. In this case set arc_no_grow to briefly pause arc
- * growth to avoid compounding the memory pressure.
+ * available.
*/
if (current_is_kswapd()) {
ARCSTAT_BUMP(arcstat_memory_indirect_count);
} else {
- arc_no_grow = B_TRUE;
- arc_kmem_reap_soon();
ARCSTAT_BUMP(arcstat_memory_direct_count);
}
- return (pages);
+ return (sc->nr_to_scan);
}
SPL_SHRINKER_DECLARE(arc_shrinker,
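arc_shrinker_count() and arc_shrinker_scan() are the two halves of the Linux shrinker contract: count reports how much the ARC could free, scan does the freeing. SPL_SHRINKER_DECLARE presumably expands to roughly the following on kernels with the split count/scan API (a sketch of that assumption, not the actual SPL macro):

static struct shrinker arc_shrinker = {
	.count_objects	= arc_shrinker_count,	/* pages we offer to free */
	.scan_objects	= arc_shrinker_scan,	/* free up to sc->nr_to_scan */
	.seeks		= DEFAULT_SEEKS,
};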
@@ -305,9 +291,56 @@ arc_lowmem_init(void)
*/
spl_register_shrinker(&arc_shrinker);
- /* Set to 1/64 of all memory or a minimum of 512K */
- arc_sys_free = MAX(allmem / 64, (512 * 1024));
- arc_need_free = 0;
+ /*
+ * The ARC tries to keep at least this much memory available for the
+ * system. This gives the ARC time to shrink in response to memory
+ * pressure, before running completely out of memory and invoking the
+ * direct-reclaim ARC shrinker.
+ *
+ * This should be more than twice high_wmark_pages(), so that
+ * arc_wait_for_eviction() will wait until at least the
+ * high_wmark_pages() are free (see arc_evict_state_impl()).
+ *
+ * Note: Even when the system is very low on memory, the kernel's
+ * shrinker code may only ask for one "batch" of pages (512KB) to be
+ * evicted. If concurrent allocations consume these pages, there may
+ * still be insufficient free pages, and the OOM killer takes action.
+ *
+ * By setting arc_sys_free large enough, and having
+ * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+ * free memory, it is much less likely that concurrent allocations can
+ * consume all the memory that was evicted before checking for
+ * OOM.
+ *
+ * It's hard to iterate the zones from a linux kernel module, which
+ * makes it difficult to determine the watermark dynamically. Instead
+ * we compute the maximum high watermark for this system, based
+ * on the amount of memory, assuming default parameters on Linux kernel
+ * 5.3.
+ */
+
+ /*
+ * Base wmark_low is 4 * the square root of Kbytes of RAM.
+ */
+ long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+ /*
+ * Clamp to between 128K and 64MB.
+ */
+ wmark = MAX(wmark, 128 * 1024);
+ wmark = MIN(wmark, 64 * 1024 * 1024);
+
+ /*
+ * watermark_boost can increase the wmark by up to 150%.
+ */
+ wmark += wmark * 150 / 100;
+
+ /*
+ * arc_sys_free needs to be more than 2x the watermark, because
+ * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up
+ * to 3x to ensure we're above it.
+ */
+ arc_sys_free = wmark * 3 + allmem / 32;
}
void
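Worked through for a 16 GiB machine, the computation above gives wmark = 16 MiB, boosted to 40 MiB, so arc_sys_free = 3 * 40 MiB + 512 MiB = 632 MiB. A standalone userspace re-creation (hypothetical; isqrt() stands in for the kernel's int_sqrt()):

#include <stdio.h>
#include <stdint.h>

static int64_t
isqrt(int64_t x)				/* stand-in for int_sqrt() */
{
	int64_t r = 0;
	while ((r + 1) * (r + 1) <= x)
		r++;
	return (r);
}

int
main(void)
{
	int64_t allmem = 16LL << 30;			/* 16 GiB of RAM */
	int64_t wmark = 4 * isqrt(allmem / 1024) * 1024; /* 16 MiB */

	wmark = wmark < 128 * 1024 ? 128 * 1024 : wmark;
	wmark = wmark > 64 * 1024 * 1024 ? 64 * 1024 * 1024 : wmark;
	wmark += wmark * 150 / 100;			/* boost: 40 MiB */

	int64_t arc_sys_free = wmark * 3 + allmem / 32;	/* 120 + 512 MiB */
	printf("arc_sys_free = %lld MiB\n",
	    (long long)(arc_sys_free >> 20));		/* prints 632 */
	return (0);
}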
@@ -348,15 +381,11 @@ int64_t
arc_available_memory(void)
{
int64_t lowest = INT64_MAX;
- free_memory_reason_t r = FMR_UNKNOWN;
/* Every 100 calls, free a small amount */
if (spa_get_random(100) == 0)
lowest = -1024;
- last_free_memory = lowest;
- last_free_reason = r;
-
return (lowest);
}
@@ -429,3 +458,8 @@ arc_prune_async(int64_t adjust)
}
mutex_exit(&arc_prune_mtx);
}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+ "Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
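Since the parameter is declared ZMOD_RW, it should be tunable at runtime through the usual Linux module-parameter path (presumably /sys/module/zfs/parameters/zfs_arc_shrinker_limit) in addition to module load time.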