diff options
Diffstat (limited to 'module/os')
-rw-r--r-- | module/os/freebsd/zfs/arc_os.c | 19
-rw-r--r-- | module/os/linux/zfs/arc_os.c | 216
2 files changed, 129 insertions, 106 deletions
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 554896d85..5f4b5df4a 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -52,9 +52,6 @@ extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - static void arc_free_target_init(void *unused __unused) { @@ -100,7 +97,6 @@ arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n __unused; - free_memory_reason_t r = FMR_UNKNOWN; /* * Cooperate with pagedaemon when it's time for it to scan @@ -109,7 +105,6 @@ arc_available_memory(void) n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; - r = FMR_LOTSFREE; } #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* @@ -126,13 +121,10 @@ arc_available_memory(void) n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { lowest = n; - r = FMR_HEAP_ARENA; } #endif - last_free_memory = lowest; - last_free_reason = r; - DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); + DTRACE_PROBE1(arc__available_memory, int64_t, lowest); return (lowest); } @@ -223,18 +215,15 @@ arc_lowmem(void *arg __unused, int howto __unused) DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. 
*/ if (curproc == pageproc) - (void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock); - mutex_exit(&arc_evict_lock); + arc_wait_for_eviction(to_free); + else + arc_wait_for_eviction(0); } void diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 9c3a6a4e2..92f9bae8c 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -57,8 +57,22 @@ #include <sys/trace_zfs.h> #include <sys/aggsum.h> -int64_t last_free_memory; -free_memory_reason_t last_free_reason; +/* + * This is a limit on how many pages the ARC shrinker makes available for + * eviction in response to one page allocation attempt. Note that in + * practice, the kernel's shrinker can ask us to evict up to about 4x this + * for one allocation attempt. + * + * The default limit of 10,000 (in practice, 160MB per allocation attempt + * with 4K pages) limits the amount of time spent attempting to reclaim ARC + * memory to less than 100ms per allocation attempt, even with a small + * average compressed block size of ~8KB. + * + * See also the comment in arc_shrinker_count(). + * Set to 0 to disable limit. + */ +int zfs_arc_shrinker_limit = 10000; + /* * Return a default max arc size based on the amount of physical memory. @@ -105,16 +119,6 @@ arc_free_memory(void) } /* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. - */ -int64_t arc_swapfs_reserve = 64; - -/* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. 
@@ -122,25 +126,7 @@ int64_t arc_swapfs_reserve = 64; int64_t arc_available_memory(void) { - int64_t lowest = INT64_MAX; - free_memory_reason_t r = FMR_UNKNOWN; - int64_t n; - - if (arc_need_free > 0) { - lowest = -arc_need_free; - r = FMR_NEEDFREE; - } - - n = arc_free_memory() - arc_sys_free - arc_need_free; - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - - last_free_memory = lowest; - last_free_reason = r; - - return (lowest); + return (arc_free_memory() - arc_sys_free); } static uint64_t @@ -174,84 +160,84 @@ arc_evictable_memory(void) static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - return (btop((int64_t)arc_evictable_memory())); + /* + * __GFP_FS won't be set if we are called from ZFS code (see + * kmem_flags_convert(), which removes it). To avoid a deadlock, we + * don't allow evicting in this case. We return 0 rather than + * SHRINK_STOP so that the shrinker logic doesn't accumulate a + * deficit against us. + */ + if (!(sc->gfp_mask & __GFP_FS)) { + return (0); + } + + /* + * This code is reached in the "direct reclaim" case, where the + * kernel (outside ZFS) is trying to allocate a page, and the system + * is low on memory. + * + * The kernel's shrinker code doesn't understand how many pages the + * ARC's callback actually frees, so it may ask the ARC to shrink a + * lot for one page allocation. This is problematic because it may + * take a long time, thus delaying the page allocation, and because + * it may force the ARC to unnecessarily shrink very small. + * + * Therefore, we limit the amount of data that we say is evictable, + * which limits the amount that the shrinker will ask us to evict for + * one page allocation attempt. + * + * In practice, we may be asked to shrink 4x the limit to satisfy one + * page allocation, before the kernel's shrinker code gives up on us. + * When that happens, we rely on the kernel code to find the pages + * that we freed before invoking the OOM killer. 
This happens in + * __alloc_pages_slowpath(), which retries and finds the pages we + * freed when it calls get_page_from_freelist(). + * + * See also the comment above zfs_arc_shrinker_limit. + */ + int64_t limit = zfs_arc_shrinker_limit != 0 ? + zfs_arc_shrinker_limit : INT64_MAX; + return (MIN(limit, btop((int64_t)arc_evictable_memory()))); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - int64_t pages; + ASSERT((sc->gfp_mask & __GFP_FS) != 0); /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; - /* Return the potential number of reclaimable pages */ - pages = btop((int64_t)arc_evictable_memory()); - - /* Not allowed to perform filesystem reclaim */ - if (!(sc->gfp_mask & __GFP_FS)) - return (SHRINK_STOP); - - /* Reclaim in progress */ - if (mutex_tryenter(&arc_evict_lock) == 0) { - ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan)); - return (0); - } - - mutex_exit(&arc_evict_lock); + /* + * Evict the requested number of pages by reducing arc_c and waiting + * for the requested amount of data to be evicted. + */ + arc_reduce_target_size(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan)); + if (current->reclaim_state != NULL) + current->reclaim_state->reclaimed_slab += sc->nr_to_scan; /* - * Evict the requested number of pages by shrinking arc_c the - * requested amount. + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. */ - if (pages > 0) { - arc_reduce_target_size(ptob(sc->nr_to_scan)); - - /* - * Repeated calls to the arc shrinker can reduce arc_c - * drastically, potentially all the way to arc_c_min. While - * arc_c is below arc_size, ZFS can't process read/write - * requests, because arc_get_data_impl() will block. 
To - * ensure that arc_c doesn't shrink faster than the evict - * thread can keep up, we wait for eviction here. - */ - mutex_enter(&arc_evict_lock); - if (arc_is_overflowing()) { - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - (void) cv_wait(&arc_evict_waiters_cv, - &arc_evict_lock); - } - mutex_exit(&arc_evict_lock); - - if (current_is_kswapd()) - arc_kmem_reap_soon(); - pages = MAX((int64_t)pages - - (int64_t)btop(arc_evictable_memory()), 0); - /* - * We've shrunk what we can, wake up threads. - */ - cv_broadcast(&arc_evict_waiters_cv); - } else - pages = SHRINK_STOP; + arc_no_grow = B_TRUE; /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory - * available. In this case set arc_no_grow to briefly pause arc - * growth to avoid compounding the memory pressure. + * available. */ if (current_is_kswapd()) { ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { - arc_no_grow = B_TRUE; - arc_kmem_reap_soon(); ARCSTAT_BUMP(arcstat_memory_direct_count); } - return (pages); + return (sc->nr_to_scan); } SPL_SHRINKER_DECLARE(arc_shrinker, @@ -305,9 +291,56 @@ arc_lowmem_init(void) */ spl_register_shrinker(&arc_shrinker); - /* Set to 1/64 of all memory or a minimum of 512K */ - arc_sys_free = MAX(allmem / 64, (512 * 1024)); - arc_need_free = 0; + /* + * The ARC tries to keep at least this much memory available for the + * system. This gives the ARC time to shrink in response to memory + * pressure, before running completely out of memory and invoking the + * direct-reclaim ARC shrinker. + * + * This should be more than twice high_wmark_pages(), so that + * arc_wait_for_eviction() will wait until at least the + * high_wmark_pages() are free (see arc_evict_state_impl()). + * + * Note: Even when the system is very low on memory, the kernel's + * shrinker code may only ask for one "batch" of pages (512KB) to be + * evicted. 
If concurrent allocations consume these pages, there may + * still be insufficient free pages, and the OOM killer takes action. + * + * By setting arc_sys_free large enough, and having + * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 + * free memory, it is much less likely that concurrent allocations can + * consume all the memory that was evicted before checking for + * OOM. + * + * It's hard to iterate the zones from a linux kernel module, which + * makes it difficult to determine the watermark dynamically. Instead + * we compute the maximum high watermark for this system, based + * on the amount of memory, assuming default parameters on Linux kernel + * 5.3. + */ + + /* + * Base wmark_low is 4 * the square root of Kbytes of RAM. + */ + long wmark = 4 * int_sqrt(allmem/1024) * 1024; + + /* + * Clamp to between 128K and 64MB. + */ + wmark = MAX(wmark, 128 * 1024); + wmark = MIN(wmark, 64 * 1024 * 1024); + + /* + * watermark_boost can increase the wmark by up to 150%. + */ + wmark += wmark * 150 / 100; + + /* + * arc_sys_free needs to be more than 2x the watermark, because + * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up + * to 3x to ensure we're above it. + */ + arc_sys_free = wmark * 3 + allmem / 32; } void @@ -348,15 +381,11 @@ int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; - free_memory_reason_t r = FMR_UNKNOWN; /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; - last_free_memory = lowest; - last_free_reason = r; - return (lowest); } @@ -429,3 +458,8 @@ arc_prune_async(int64_t adjust) } mutex_exit(&arc_prune_mtx); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, + "Limit on number of pages that ARC shrinker can reclaim at once"); +/* END CSTYLED */ |