about summary refs log tree commit diff stats
path: root/module/os/linux/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/os/linux/zfs')
-rw-r--r-- module/os/linux/zfs/arc_os.c 88
1 files changed, 47 insertions, 41 deletions
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 02dd80c06..75a9ea532 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -49,6 +49,7 @@
#include <linux/page_compat.h>
#include <linux/notifier.h>
#include <linux/memory.h>
+#include <linux/version.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -58,6 +59,7 @@
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
+#ifdef _KERNEL
/*
* This is a limit on how many pages the ARC shrinker makes available for
* eviction in response to one page allocation attempt. Note that in
@@ -72,11 +74,20 @@
* See also the comment in arc_shrinker_count().
* Set to 0 to disable limit.
*/
-int zfs_arc_shrinker_limit = 10000;
+static int zfs_arc_shrinker_limit = 10000;
+
+/*
+ * Relative cost of ARC eviction, i.e. the number of seeks needed to restore an
+ * evicted page. Larger values make the ARC more precious and its evictions
+ * smaller compared to other kernel subsystems. A value of 4 means parity with
+ * the page cache, per my reading of do_shrink_slab() and related kernel code.
+ */
+static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS;
#ifdef CONFIG_MEMORY_HOTPLUG
static struct notifier_block arc_hotplug_callback_mem_nb;
#endif
+#endif
/*
* Return a default max arc size based on the amount of physical memory.
@@ -170,22 +181,7 @@ static unsigned long
arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
/*
- * __GFP_FS won't be set if we are called from ZFS code (see
- * kmem_flags_convert(), which removes it). To avoid a deadlock, we
- * don't allow evicting in this case. We return 0 rather than
- * SHRINK_STOP so that the shrinker logic doesn't accumulate a
- * deficit against us.
- */
- if (!(sc->gfp_mask & __GFP_FS)) {
- return (0);
- }
-
- /*
- * This code is reached in the "direct reclaim" case, where the
- * kernel (outside ZFS) is trying to allocate a page, and the system
- * is low on memory.
- *
- * The kernel's shrinker code doesn't understand how many pages the
+ * The kernel's shrinker code may not understand how many pages the
* ARC's callback actually frees, so it may ask the ARC to shrink a
* lot for one page allocation. This is problematic because it may
* take a long time, thus delaying the page allocation, and because
@@ -204,41 +200,45 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
*
* See also the comment above zfs_arc_shrinker_limit.
*/
+ int64_t can_free = btop(arc_evictable_memory());
int64_t limit = zfs_arc_shrinker_limit != 0 ?
zfs_arc_shrinker_limit : INT64_MAX;
- return (MIN(limit, btop((int64_t)arc_evictable_memory())));
+ return (MIN(can_free, limit));
}
static unsigned long
arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- ASSERT((sc->gfp_mask & __GFP_FS) != 0);
-
/* The arc is considered warm once reclaim has occurred */
if (unlikely(arc_warm == B_FALSE))
arc_warm = B_TRUE;
/*
+ * We are experiencing memory pressure which the arc_evict_zthr was
+ * unable to keep up with. Set arc_no_grow to briefly pause ARC
+ * growth to avoid compounding the memory pressure.
+ */
+ arc_no_grow = B_TRUE;
+
+ /*
* Evict the requested number of pages by reducing arc_c and waiting
- * for the requested amount of data to be evicted.
+ * for the requested amount of data to be evicted. To avoid deadlock
+ * do not wait for eviction if we may be called from ZFS itself (see
+ * kmem_flags_convert() removing __GFP_FS). It may cause excessive
+ * eviction later if many evictions are accumulated, but just skipping
+ * the eviction is not good either if most of the memory is used by the ARC.
*/
- arc_reduce_target_size(ptob(sc->nr_to_scan));
- arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
+ uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan));
+ if (sc->gfp_mask & __GFP_FS)
+ arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
if (current->reclaim_state != NULL)
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
- current->reclaim_state->reclaimed += sc->nr_to_scan;
+ current->reclaim_state->reclaimed += btop(to_free);
#else
- current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+ current->reclaim_state->reclaimed_slab += btop(to_free);
#endif
/*
- * We are experiencing memory pressure which the arc_evict_zthr was
- * unable to keep up with. Set arc_no_grow to briefly pause arc
- * growth to avoid compounding the memory pressure.
- */
- arc_no_grow = B_TRUE;
-
- /*
* When direct reclaim is observed it usually indicates a rapid
* increase in memory pressure. This occurs because the kswapd
* threads were unable to asynchronously keep enough free memory
@@ -250,7 +250,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
ARCSTAT_BUMP(arcstat_memory_direct_count);
}
- return (sc->nr_to_scan);
+ return (btop(to_free));
}
static struct shrinker *arc_shrinker = NULL;
@@ -304,9 +304,7 @@ arc_set_sys_free(uint64_t allmem)
* arc_wait_for_eviction() will wait until at least the
* high_wmark_pages() are free (see arc_evict_state_impl()).
*
- * Note: Even when the system is very low on memory, the kernel's
- * shrinker code may only ask for one "batch" of pages (512KB) to be
- * evicted. If concurrent allocations consume these pages, there may
+ * Note: If concurrent allocations consume these pages, there may
* still be insufficient free pages, and the OOM killer takes action.
*
* By setting arc_sys_free large enough, and having
@@ -318,20 +316,26 @@ arc_set_sys_free(uint64_t allmem)
* It's hard to iterate the zones from a linux kernel module, which
* makes it difficult to determine the watermark dynamically. Instead
* we compute the maximum high watermark for this system, based
- * on the amount of memory, assuming default parameters on Linux kernel
- * 5.3.
+ * on the amount of memory, using the same method as the kernel uses
+ * to calculate its internal `min_free_kbytes` variable. See
+ * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value
+ * from 64M to 256M.
*/
/*
* Base wmark_low is 4 * the square root of Kbytes of RAM.
*/
- long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+ long wmark = int_sqrt(allmem / 1024 * 16) * 1024;
/*
- * Clamp to between 128K and 64MB.
+ * Clamp to between 128K and 256MB (64MB on kernels older than 5.7).
*/
wmark = MAX(wmark, 128 * 1024);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
+ wmark = MIN(wmark, 256 * 1024 * 1024);
+#else
wmark = MIN(wmark, 64 * 1024 * 1024);
+#endif
/*
* watermark_boost can increase the wmark by up to 150%.
@@ -357,7 +361,7 @@ arc_lowmem_init(void)
* swapping out pages when it is preferable to shrink the arc.
*/
arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
- arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+ arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks);
VERIFY(arc_shrinker);
arc_set_sys_free(allmem);
@@ -500,3 +504,5 @@ arc_unregister_hotplug(void)
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
"Limit on number of pages that ARC shrinker can reclaim at once");
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD,
+ "Relative cost of ARC eviction vs other kernel subsystems");