diff options
Diffstat (limited to 'module/os/linux/zfs')
-rw-r--r-- | module/os/linux/zfs/arc_os.c | 88 |
1 file changed, 47 insertions, 41 deletions
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 02dd80c06..75a9ea532 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -49,6 +49,7 @@ #include <linux/page_compat.h> #include <linux/notifier.h> #include <linux/memory.h> +#include <linux/version.h> #endif #include <sys/callb.h> #include <sys/kstat.h> @@ -58,6 +59,7 @@ #include <sys/trace_zfs.h> #include <sys/aggsum.h> +#ifdef _KERNEL /* * This is a limit on how many pages the ARC shrinker makes available for * eviction in response to one page allocation attempt. Note that in @@ -72,11 +74,20 @@ * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ -int zfs_arc_shrinker_limit = 10000; +static int zfs_arc_shrinker_limit = 10000; + +/* + * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted + * page. Bigger values make ARC more precious and evictions smaller comparing + * to other kernel subsystems. Value of 4 means parity with page cache, + * according to my reading of kernel's do_shrink_slab() and other code. + */ +static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS; #ifdef CONFIG_MEMORY_HOTPLUG static struct notifier_block arc_hotplug_callback_mem_nb; #endif +#endif /* * Return a default max arc size based on the amount of physical memory. @@ -170,22 +181,7 @@ static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { /* - * __GFP_FS won't be set if we are called from ZFS code (see - * kmem_flags_convert(), which removes it). To avoid a deadlock, we - * don't allow evicting in this case. We return 0 rather than - * SHRINK_STOP so that the shrinker logic doesn't accumulate a - * deficit against us. - */ - if (!(sc->gfp_mask & __GFP_FS)) { - return (0); - } - - /* - * This code is reached in the "direct reclaim" case, where the - * kernel (outside ZFS) is trying to allocate a page, and the system - * is low on memory. 
- * - * The kernel's shrinker code doesn't understand how many pages the + * The kernel's shrinker code may not understand how many pages the * ARC's callback actually frees, so it may ask the ARC to shrink a * lot for one page allocation. This is problematic because it may * take a long time, thus delaying the page allocation, and because @@ -204,41 +200,45 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) * * See also the comment above zfs_arc_shrinker_limit. */ + int64_t can_free = btop(arc_evictable_memory()); int64_t limit = zfs_arc_shrinker_limit != 0 ? zfs_arc_shrinker_limit : INT64_MAX; - return (MIN(limit, btop((int64_t)arc_evictable_memory()))); + return (MIN(can_free, limit)); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - ASSERT((sc->gfp_mask & __GFP_FS) != 0); - /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; /* + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause ARC + * growth to avoid compounding the memory pressure. + */ + arc_no_grow = B_TRUE; + + /* * Evict the requested number of pages by reducing arc_c and waiting - * for the requested amount of data to be evicted. + * for the requested amount of data to be evicted. To avoid deadlock + * do not wait for eviction if we may be called from ZFS itself (see + * kmem_flags_convert() removing __GFP_FS). It may cause excessive + * eviction later if many evictions are accumulated, but just skipping + * the eviction is not good either if most of memory is used by ARC. 
*/ - arc_reduce_target_size(ptob(sc->nr_to_scan)); - arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); + uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan)); + if (sc->gfp_mask & __GFP_FS) + arc_wait_for_eviction(to_free, B_FALSE, B_FALSE); if (current->reclaim_state != NULL) #ifdef HAVE_RECLAIM_STATE_RECLAIMED - current->reclaim_state->reclaimed += sc->nr_to_scan; + current->reclaim_state->reclaimed += btop(to_free); #else - current->reclaim_state->reclaimed_slab += sc->nr_to_scan; + current->reclaim_state->reclaimed_slab += btop(to_free); #endif /* - * We are experiencing memory pressure which the arc_evict_zthr was - * unable to keep up with. Set arc_no_grow to briefly pause arc - * growth to avoid compounding the memory pressure. - */ - arc_no_grow = B_TRUE; - - /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory @@ -250,7 +250,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_direct_count); } - return (sc->nr_to_scan); + return (btop(to_free)); } static struct shrinker *arc_shrinker = NULL; @@ -304,9 +304,7 @@ arc_set_sys_free(uint64_t allmem) * arc_wait_for_eviction() will wait until at least the * high_wmark_pages() are free (see arc_evict_state_impl()). * - * Note: Even when the system is very low on memory, the kernel's - * shrinker code may only ask for one "batch" of pages (512KB) to be - * evicted. If concurrent allocations consume these pages, there may + * Note: If concurrent allocations consume these pages, there may * still be insufficient free pages, and the OOM killer takes action. * * By setting arc_sys_free large enough, and having @@ -318,20 +316,26 @@ arc_set_sys_free(uint64_t allmem) * It's hard to iterate the zones from a linux kernel module, which * makes it difficult to determine the watermark dynamically. 
Instead * we compute the maximum high watermark for this system, based - * on the amount of memory, assuming default parameters on Linux kernel - * 5.3. + * on the amount of memory, using the same method as the kernel uses + * to calculate its internal `min_free_kbytes` variable. See + * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value + * from 64M to 256M. */ /* * Base wmark_low is 4 * the square root of Kbytes of RAM. */ - long wmark = 4 * int_sqrt(allmem/1024) * 1024; + long wmark = int_sqrt(allmem / 1024 * 16) * 1024; /* - * Clamp to between 128K and 64MB. + * Clamp to between 128K and 256/64MB. */ wmark = MAX(wmark, 128 * 1024); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + wmark = MIN(wmark, 256 * 1024 * 1024); +#else wmark = MIN(wmark, 64 * 1024 * 1024); +#endif /* * watermark_boost can increase the wmark by up to 150%. @@ -357,7 +361,7 @@ arc_lowmem_init(void) * swapping out pages when it is preferable to shrink the arc. */ arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", - arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks); VERIFY(arc_shrinker); arc_set_sys_free(allmem); @@ -500,3 +504,5 @@ arc_unregister_hotplug(void) ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD, + "Relative cost of ARC eviction vs other kernel subsystems"); |