-rw-r--r--   include/os/linux/spl/sys/shrinker.h   |   4
-rw-r--r--   include/os/linux/zfs/sys/trace_arc.h  |  36
-rw-r--r--   include/sys/arc_impl.h                |  20
-rw-r--r--   man/man5/zfs-module-parameters.5      |  40
-rw-r--r--   module/os/freebsd/zfs/arc_os.c        |  19
-rw-r--r--   module/os/linux/zfs/arc_os.c          | 216
-rw-r--r--   module/zfs/arc.c                      | 252
7 files changed, 387 insertions, 200 deletions
diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h
index e519a527c..cc34d8ab1 100644
--- a/include/os/linux/spl/sys/shrinker.h
+++ b/include/os/linux/spl/sys/shrinker.h
@@ -84,7 +84,7 @@ __ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\
 									\
 static struct shrinker varname = {					\
 	.shrink = __ ## varname ## _wrapper,				\
-	.seeks = seek_cost						\
+	.seeks = seek_cost,						\
 }
 
 #define	SHRINK_STOP (-1)
@@ -97,7 +97,7 @@ static struct shrinker varname = {				\
 static struct shrinker varname = {					\
 	.count_objects = countfunc,					\
 	.scan_objects = scanfunc,					\
-	.seeks = seek_cost						\
+	.seeks = seek_cost,						\
 }
 
 #else
diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h
index 5ce5b38a3..faf2bd3d5 100644
--- a/include/os/linux/zfs/sys/trace_arc.h
+++ b/include/os/linux/zfs/sys/trace_arc.h
@@ -354,6 +354,41 @@ DEFINE_EVENT(zfs_l2arc_evict_class, name, \
 /* END CSTYLED */
 DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict);
 
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(...,
+ *     uint64_t, ...,
+ *     uint64_t, ...,
+ *     uint64_t, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class,
+	TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count),
+	TP_ARGS(amount, arc_evict_count, aew_count),
+	TP_STRUCT__entry(
+	    __field(uint64_t, amount)
+	    __field(uint64_t, arc_evict_count)
+	    __field(uint64_t, aew_count)
+	),
+	TP_fast_assign(
+	    __entry->amount = amount;
+	    __entry->arc_evict_count = arc_evict_count;
+	    __entry->aew_count = aew_count;
+	),
+	TP_printk("amount %llu arc_evict_count %llu aew_count %llu",
+	    __entry->amount, __entry->arc_evict_count, __entry->aew_count)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define	DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \
+DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \
+	TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \
+	TP_ARGS(amount, arc_evict_count, aew_count))
+/* END CSTYLED */
+DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction);
+
 #endif /* _TRACE_ARC_H */
 
 #undef TRACE_INCLUDE_PATH
@@ -376,6 +411,7 @@ DEFINE_DTRACE_PROBE1(l2arc__miss);
 DEFINE_DTRACE_PROBE2(l2arc__read);
 DEFINE_DTRACE_PROBE2(l2arc__write);
 DEFINE_DTRACE_PROBE2(l2arc__iodone);
+DEFINE_DTRACE_PROBE3(arc__wait__for__eviction);
 DEFINE_DTRACE_PROBE4(arc__miss);
 DEFINE_DTRACE_PROBE4(l2arc__evict);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index eb90d5bc9..bb9163ba7 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -846,15 +846,11 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_cached_only_in_progress;
 } arc_stats_t;
 
-typedef enum free_memory_reason_t {
-	FMR_UNKNOWN,
-	FMR_NEEDFREE,
-	FMR_LOTSFREE,
-	FMR_SWAPFS_MINFREE,
-	FMR_PAGES_PP_MAXIMUM,
-	FMR_HEAP_ARENA,
-	FMR_ZIO_ARENA,
-} free_memory_reason_t;
+typedef struct arc_evict_waiter {
+	list_node_t aew_node;
+	kcondvar_t aew_cv;
+	uint64_t aew_count;
+} arc_evict_waiter_t;
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
 
@@ -870,7 +866,6 @@ typedef enum free_memory_reason_t {
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 #define	arc_sys_free	ARCSTAT(arcstat_sys_free) /* target system free bytes */
-#define	arc_need_free	ARCSTAT(arcstat_need_free) /* bytes to be freed */
 
 extern taskq_t *arc_prune_taskq;
 extern arc_stats_t arc_stats;
@@ -879,10 +874,6 @@ extern boolean_t arc_warm;
 extern int arc_grow_retry;
 extern int arc_no_grow_shift;
 extern int arc_shrink_shift;
-extern zthr_t *arc_evict_zthr;
-extern kmutex_t arc_evict_lock;
-extern kcondvar_t arc_evict_waiters_cv;
-extern boolean_t arc_evict_needed;
 extern kmutex_t arc_prune_mtx;
 extern list_t arc_prune_list;
 extern aggsum_t arc_size;
@@ -897,6 +888,7 @@ extern void arc_reduce_target_size(int64_t to_free);
 extern boolean_t arc_reclaim_needed(void);
 extern void arc_kmem_reap_soon(void);
 extern boolean_t arc_is_overflowing(void);
+extern void arc_wait_for_eviction(uint64_t);
 
 extern void arc_lowmem_init(void);
 extern void arc_lowmem_fini(void);
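For orientation, the protocol built on the new arc_evict_waiter_t can be condensed as
follows. This is an illustrative sketch only, not part of the patch; the helper names
wait_sketch() and wake_sketch() are hypothetical, and it simply mirrors
arc_wait_for_eviction() and the wakeup loop in arc_evict_state_impl() further down,
with tracing and the arc_need_free bookkeeping omitted.

/* Waiter side: queue for `amount` more bytes to be evicted, then sleep. */
static void
wait_sketch(uint64_t amount)
{
	arc_evict_waiter_t aw;

	list_link_init(&aw.aew_node);
	cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);

	mutex_enter(&arc_evict_lock);
	arc_evict_waiter_t *last = list_tail(&arc_evict_waiters);
	/* Targets are cumulative, so the list stays sorted by aew_count. */
	aw.aew_count = (last != NULL ? last->aew_count : arc_evict_count) + amount;
	list_insert_tail(&arc_evict_waiters, &aw);
	cv_wait(&aw.aew_cv, &arc_evict_lock);
	if (list_link_active(&aw.aew_node))	/* spurious wakeup */
		list_remove(&arc_evict_waiters, &aw);
	mutex_exit(&arc_evict_lock);

	cv_destroy(&aw.aew_cv);
}

/* Evictor side: credit evicted bytes and wake satisfied waiters in order. */
static void
wake_sketch(uint64_t bytes_evicted)
{
	arc_evict_waiter_t *aw;

	mutex_enter(&arc_evict_lock);
	arc_evict_count += bytes_evicted;
	while ((aw = list_head(&arc_evict_waiters)) != NULL &&
	    aw->aew_count <= arc_evict_count) {
		list_remove(&arc_evict_waiters, aw);
		cv_broadcast(&aw->aew_cv);
	}
	mutex_exit(&arc_evict_lock);
}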
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index c2abd9d80..c209acbe1 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -864,6 +864,23 @@ Default value: \fB8192\fR.
 .sp
 .ne 2
 .na
+\fBzfs_arc_eviction_pct\fR (int)
+.ad
+.RS 12n
+When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this
+percent of the requested amount of data to be evicted.  For example, by
+default for every 2KB that's evicted, 1KB of it may be "reused" by a new
+allocation.  Since this is above 100%, it ensures that progress is made
+towards getting \fBarc_size\fR under \fBarc_c\fR.  Since this is finite, it
+ensures that allocations can still happen, even during the potentially long
+time that \fBarc_size\fR is more than \fBarc_c\fR.
+.sp
+Default value: \fB200\fR.
+.RE
+
+.sp
+.ne 2
+.na
 \fBzfs_arc_evict_batch_limit\fR (int)
 .ad
 .RS 12n
@@ -1151,6 +1168,29 @@ Default value: \fB0\fR% (disabled).
 .sp
 .ne 2
 .na
+\fBzfs_arc_shrinker_limit\fR (int)
+.ad
+.RS 12n
+This is a limit on how many pages the ARC shrinker makes available for
+eviction in response to one page allocation attempt.  Note that in
+practice, the kernel's shrinker can ask us to evict up to about 4x this
+for one allocation attempt.
+.sp
+The default limit of 10,000 (in practice, 160MB per allocation attempt with
+4K pages) limits the amount of time spent attempting to reclaim ARC memory to
+less than 100ms per allocation attempt, even with a small average compressed
+block size of ~8KB.
+.sp
+The parameter can be set to 0 (zero) to disable the limit.
+.sp
+This parameter only applies on Linux.
+.sp
+Default value: \fB10,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
 \fBzfs_arc_sys_free\fR (ulong)
 .ad
 .RS 12n
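To make the arithmetic behind these two defaults concrete, a small userspace
calculation (illustrative only, not part of the patch; assumes 4 KiB pages and the
default tunable values documented above):

#include <stdio.h>

int
main(void)
{
	unsigned long page = 4096;
	unsigned long limit = 10000;	/* zfs_arc_shrinker_limit */
	unsigned long pct = 200;	/* zfs_arc_eviction_pct */

	/* A 1 MiB ARC allocation waits for ~2 MiB to be evicted. */
	unsigned long alloc = 1024 * 1024;
	printf("wait for %lu KiB per 1 MiB allocation\n",
	    alloc * pct / 100 / 1024);

	/* One shrinker call exposes ~40 MB; the kernel may ask ~4x that. */
	printf("per shrinker call: ~%lu MB\n", limit * page / 1000000);
	printf("per allocation attempt: ~%lu MB\n",
	    4 * limit * page / 1000000);
	return (0);
}

This prints roughly 2048 KiB, ~40 MB, and ~163 MB, which is where the "160MB per
allocation attempt" figure in the man page text comes from.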
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 554896d85..5f4b5df4a 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -52,9 +52,6 @@ extern struct vfsops zfs_vfsops;
 
 uint_t zfs_arc_free_target = 0;
 
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
-
 static void
 arc_free_target_init(void *unused __unused)
 {
@@ -100,7 +97,6 @@ arc_available_memory(void)
 {
 	int64_t lowest = INT64_MAX;
 	int64_t n __unused;
-	free_memory_reason_t r = FMR_UNKNOWN;
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
@@ -109,7 +105,6 @@ arc_available_memory(void)
 	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
 	if (n < lowest) {
 		lowest = n;
-		r = FMR_LOTSFREE;
 	}
 #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	/*
@@ -126,13 +121,10 @@ arc_available_memory(void)
 	n = uma_avail() - (long)(uma_limit() / 4);
 	if (n < lowest) {
 		lowest = n;
-		r = FMR_HEAP_ARENA;
 	}
 #endif
 
-	last_free_memory = lowest;
-	last_free_reason = r;
-	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+	DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
 	return (lowest);
 }
 
@@ -223,18 +215,15 @@ arc_lowmem(void *arg __unused, int howto __unused)
 
 	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
 	arc_reduce_target_size(to_free);
-	mutex_enter(&arc_evict_lock);
-	arc_evict_needed = B_TRUE;
-	zthr_wakeup(arc_evict_zthr);
-
 	/*
 	 * It is unsafe to block here in arbitrary threads, because we can come
 	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
 	 * with ARC reclaim thread.
 	 */
 	if (curproc == pageproc)
-		(void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock);
-	mutex_exit(&arc_evict_lock);
+		arc_wait_for_eviction(to_free);
+	else
+		arc_wait_for_eviction(0);
 }
 
 void
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 9c3a6a4e2..92f9bae8c 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -57,8 +57,22 @@
 #include <sys/trace_zfs.h>
 #include <sys/aggsum.h>
 
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt.  Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
 
 /*
  * Return a default max arc size based on the amount of physical memory.
@@ -105,16 +119,6 @@ arc_free_memory(void)
 }
 
 /*
- * Additional reserve of pages for pp_reserve.
- */
-int64_t arc_pages_pp_reserve = 64;
-
-/*
- * Additional reserve of pages for swapfs.
- */
-int64_t arc_swapfs_reserve = 64;
-
-/*
  * Return the amount of memory that can be consumed before reclaim will be
  * needed.  Positive if there is sufficient free memory, negative indicates
  * the amount of memory that needs to be freed up.
@@ -122,25 +126,7 @@ int64_t arc_swapfs_reserve = 64;
 int64_t
 arc_available_memory(void)
 {
-	int64_t lowest = INT64_MAX;
-	free_memory_reason_t r = FMR_UNKNOWN;
-	int64_t n;
-
-	if (arc_need_free > 0) {
-		lowest = -arc_need_free;
-		r = FMR_NEEDFREE;
-	}
-
-	n = arc_free_memory() - arc_sys_free - arc_need_free;
-	if (n < lowest) {
-		lowest = n;
-		r = FMR_LOTSFREE;
-	}
-
-	last_free_memory = lowest;
-	last_free_reason = r;
-
-	return (lowest);
+	return (arc_free_memory() - arc_sys_free);
 }
 
 static uint64_t
@@ -174,84 +160,84 @@ arc_evictable_memory(void)
 static unsigned long
 arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	return (btop((int64_t)arc_evictable_memory()));
+	/*
+	 * __GFP_FS won't be set if we are called from ZFS code (see
+	 * kmem_flags_convert(), which removes it).  To avoid a deadlock, we
+	 * don't allow evicting in this case.  We return 0 rather than
+	 * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+	 * deficit against us.
+	 */
+	if (!(sc->gfp_mask & __GFP_FS)) {
+		return (0);
+	}
+
+	/*
+	 * This code is reached in the "direct reclaim" case, where the
+	 * kernel (outside ZFS) is trying to allocate a page, and the system
+	 * is low on memory.
+	 *
+	 * The kernel's shrinker code doesn't understand how many pages the
+	 * ARC's callback actually frees, so it may ask the ARC to shrink a
+	 * lot for one page allocation.  This is problematic because it may
+	 * take a long time, thus delaying the page allocation, and because
+	 * it may force the ARC to unnecessarily shrink very small.
+	 *
+	 * Therefore, we limit the amount of data that we say is evictable,
+	 * which limits the amount that the shrinker will ask us to evict for
+	 * one page allocation attempt.
+	 *
+	 * In practice, we may be asked to shrink 4x the limit to satisfy one
+	 * page allocation, before the kernel's shrinker code gives up on us.
+	 * When that happens, we rely on the kernel code to find the pages
+	 * that we freed before invoking the OOM killer.  This happens in
+	 * __alloc_pages_slowpath(), which retries and finds the pages we
+	 * freed when it calls get_page_from_freelist().
+	 *
+	 * See also the comment above zfs_arc_shrinker_limit.
+	 */
+	int64_t limit = zfs_arc_shrinker_limit != 0 ?
+	    zfs_arc_shrinker_limit : INT64_MAX;
+	return (MIN(limit, btop((int64_t)arc_evictable_memory())));
 }
 
 static unsigned long
 arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int64_t pages;
+	ASSERT((sc->gfp_mask & __GFP_FS) != 0);
 
 	/* The arc is considered warm once reclaim has occurred */
 	if (unlikely(arc_warm == B_FALSE))
 		arc_warm = B_TRUE;
 
-	/* Return the potential number of reclaimable pages */
-	pages = btop((int64_t)arc_evictable_memory());
-
-	/* Not allowed to perform filesystem reclaim */
-	if (!(sc->gfp_mask & __GFP_FS))
-		return (SHRINK_STOP);
-
-	/* Reclaim in progress */
-	if (mutex_tryenter(&arc_evict_lock) == 0) {
-		ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
-		return (0);
-	}
-
-	mutex_exit(&arc_evict_lock);
+	/*
+	 * Evict the requested number of pages by reducing arc_c and waiting
+	 * for the requested amount of data to be evicted.
+	 */
+	arc_reduce_target_size(ptob(sc->nr_to_scan));
+	arc_wait_for_eviction(ptob(sc->nr_to_scan));
+	if (current->reclaim_state != NULL)
+		current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
 
 	/*
-	 * Evict the requested number of pages by shrinking arc_c the
-	 * requested amount.
+	 * We are experiencing memory pressure which the arc_evict_zthr was
+	 * unable to keep up with.  Set arc_no_grow to briefly pause arc
+	 * growth to avoid compounding the memory pressure.
 	 */
-	if (pages > 0) {
-		arc_reduce_target_size(ptob(sc->nr_to_scan));
-
-		/*
-		 * Repeated calls to the arc shrinker can reduce arc_c
-		 * drastically, potentially all the way to arc_c_min.  While
-		 * arc_c is below arc_size, ZFS can't process read/write
-		 * requests, because arc_get_data_impl() will block.  To
-		 * ensure that arc_c doesn't shrink faster than the evict
-		 * thread can keep up, we wait for eviction here.
-		 */
-		mutex_enter(&arc_evict_lock);
-		if (arc_is_overflowing()) {
-			arc_evict_needed = B_TRUE;
-			zthr_wakeup(arc_evict_zthr);
-			(void) cv_wait(&arc_evict_waiters_cv,
-			    &arc_evict_lock);
-		}
-		mutex_exit(&arc_evict_lock);
-
-		if (current_is_kswapd())
-			arc_kmem_reap_soon();
-		pages = MAX((int64_t)pages -
-		    (int64_t)btop(arc_evictable_memory()), 0);
-		/*
-		 * We've shrunk what we can, wake up threads.
-		 */
-		cv_broadcast(&arc_evict_waiters_cv);
-	} else
-		pages = SHRINK_STOP;
+	arc_no_grow = B_TRUE;
 
 	/*
 	 * When direct reclaim is observed it usually indicates a rapid
 	 * increase in memory pressure.  This occurs because the kswapd
 	 * threads were unable to asynchronously keep enough free memory
-	 * available.  In this case set arc_no_grow to briefly pause arc
-	 * growth to avoid compounding the memory pressure.
+	 * available.
 	 */
 	if (current_is_kswapd()) {
 		ARCSTAT_BUMP(arcstat_memory_indirect_count);
 	} else {
-		arc_no_grow = B_TRUE;
-		arc_kmem_reap_soon();
 		ARCSTAT_BUMP(arcstat_memory_direct_count);
 	}
 
-	return (pages);
+	return (sc->nr_to_scan);
 }
 
 SPL_SHRINKER_DECLARE(arc_shrinker,
@@ -305,9 +291,56 @@ arc_lowmem_init(void)
 	 */
 	spl_register_shrinker(&arc_shrinker);
 
-	/* Set to 1/64 of all memory or a minimum of 512K */
-	arc_sys_free = MAX(allmem / 64, (512 * 1024));
-	arc_need_free = 0;
+	/*
+	 * The ARC tries to keep at least this much memory available for the
+	 * system.  This gives the ARC time to shrink in response to memory
+	 * pressure, before running completely out of memory and invoking the
+	 * direct-reclaim ARC shrinker.
+	 *
+	 * This should be more than twice high_wmark_pages(), so that
+	 * arc_wait_for_eviction() will wait until at least the
+	 * high_wmark_pages() are free (see arc_evict_state_impl()).
+	 *
+	 * Note: Even when the system is very low on memory, the kernel's
+	 * shrinker code may only ask for one "batch" of pages (512KB) to be
+	 * evicted.  If concurrent allocations consume these pages, there may
+	 * still be insufficient free pages, and the OOM killer takes action.
+	 *
+	 * By setting arc_sys_free large enough, and having
+	 * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+	 * free memory, it is much less likely that concurrent allocations can
+	 * consume all the memory that was evicted before checking for
+	 * OOM.
+	 *
+	 * It's hard to iterate the zones from a linux kernel module, which
+	 * makes it difficult to determine the watermark dynamically.  Instead
+	 * we compute the maximum high watermark for this system, based
+	 * on the amount of memory, assuming default parameters on Linux kernel
+	 * 5.3.
+	 */
+
+	/*
+	 * Base wmark_low is 4 * the square root of Kbytes of RAM.
+	 */
+	long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+	/*
+	 * Clamp to between 128K and 64MB.
+	 */
+	wmark = MAX(wmark, 128 * 1024);
+	wmark = MIN(wmark, 64 * 1024 * 1024);
+
+	/*
+	 * watermark_boost can increase the wmark by up to 150%.
+	 */
+	wmark += wmark * 150 / 100;
+
+	/*
+	 * arc_sys_free needs to be more than 2x the watermark, because
+	 * arc_wait_for_eviction() waits for half of arc_sys_free.  Bump this up
+	 * to 3x to ensure we're above it.
+	 */
+	arc_sys_free = wmark * 3 + allmem / 32;
 }
 
 void
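Evaluated by hand for a machine with 16 GiB of RAM (illustrative only, not part of
the patch; int_sqrt() truncates just as in the kernel), the arc_sys_free computation
above works out to roughly 632 MiB:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t allmem = 16ULL << 30;		/* 16 GiB */
	uint64_t wmark = 4 * 4096 * 1024;	/* int_sqrt(16 GiB / 1024) = 4096 -> 16 MiB */

	/* Clamp to [128 KiB, 64 MiB]: unchanged here. */
	wmark += wmark * 150 / 100;		/* watermark boost -> 40 MiB */

	uint64_t sys_free = wmark * 3 + allmem / 32;	/* 120 MiB + 512 MiB */
	printf("arc_sys_free ~= %llu MiB\n",
	    (unsigned long long)(sys_free >> 20));	/* prints 632 */
	return (0);
}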
@@ -348,15 +381,11 @@ int64_t
 arc_available_memory(void)
 {
 	int64_t lowest = INT64_MAX;
-	free_memory_reason_t r = FMR_UNKNOWN;
 
 	/* Every 100 calls, free a small amount */
 	if (spa_get_random(100) == 0)
 		lowest = -1024;
 
-	last_free_memory = lowest;
-	last_free_reason = r;
-
 	return (lowest);
 }
 
@@ -429,3 +458,8 @@ arc_prune_async(int64_t adjust)
 	}
 	mutex_exit(&arc_prune_mtx);
 }
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+	"Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index dcf710ad1..3ec98917d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -313,17 +313,38 @@ boolean_t arc_watch = B_FALSE;
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
-static zthr_t		*arc_reap_zthr;
+static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
-zthr_t		*arc_evict_zthr;
+static zthr_t *arc_evict_zthr;
 
-kmutex_t	arc_evict_lock;
-kcondvar_t	arc_evict_waiters_cv;
-boolean_t	arc_evict_needed = B_FALSE;
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted.  For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c.  Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
@@ -632,6 +653,7 @@ arc_state_t *arc_mfu;
 #define	arc_dnode_size_limit	ARCSTAT(arcstat_dnode_limit)
 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 /* size of all b_rabd's in entire arc */
 #define	arc_raw_size	ARCSTAT(arcstat_raw_size)
@@ -3859,6 +3881,20 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 	return (bytes_evicted);
 }
 
+static void
+arc_set_need_free(void)
+{
+	ASSERT(MUTEX_HELD(&arc_evict_lock));
+	int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+	arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+	if (aw == NULL) {
+		arc_need_free = MAX(-remaining, 0);
+	} else {
+		arc_need_free =
+		    MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+	}
+}
+
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, int64_t bytes)
@@ -3938,29 +3974,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			if (evicted != 0)
 				evict_count++;
 
-			/*
-			 * If arc_size isn't overflowing, signal any
-			 * threads that might happen to be waiting.
-			 *
-			 * For each header evicted, we wake up a single
-			 * thread. If we used cv_broadcast, we could
-			 * wake up "too many" threads causing arc_size
-			 * to significantly overflow arc_c; since
-			 * arc_get_data_impl() doesn't check for overflow
-			 * when it's woken up (it doesn't because it's
-			 * possible for the ARC to be overflowing while
-			 * full of un-evictable buffers, and the
-			 * function should proceed in this case).
-			 *
-			 * If threads are left sleeping, due to not
-			 * using cv_broadcast here, they will be woken
-			 * up via cv_broadcast in arc_evict_cb() just
-			 * before arc_evict_zthr sleeps.
-			 */
-			mutex_enter(&arc_evict_lock);
-			if (!arc_is_overflowing())
-				cv_signal(&arc_evict_waiters_cv);
-			mutex_exit(&arc_evict_lock);
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
@@ -3969,6 +3982,32 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	multilist_sublist_unlock(mls);
 
 	/*
+	 * Increment the count of evicted bytes, and wake up any threads that
+	 * are waiting for the count to reach this value.  Since the list is
+	 * ordered by ascending aew_count, we pop off the beginning of the
+	 * list until we reach the end, or a waiter that's past the current
+	 * "count".  Doing this outside the loop reduces the number of times
+	 * we need to acquire the global arc_evict_lock.
+	 *
+	 * Only wake when there's sufficient free memory in the system
+	 * (specifically, arc_sys_free/2, which by default is a bit more than
+	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
+	 */
+	mutex_enter(&arc_evict_lock);
+	arc_evict_count += bytes_evicted;
+
+	if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) {
+		arc_evict_waiter_t *aw;
+		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+		    aw->aew_count <= arc_evict_count) {
+			list_remove(&arc_evict_waiters, aw);
+			cv_broadcast(&aw->aew_cv);
+		}
+	}
+	arc_set_need_free();
+	mutex_exit(&arc_evict_lock);
+
+	/*
 	 * If the ARC size is reduced from arc_c_max to arc_c_min (especially
 	 * if the average cached block is small), eviction can be on-CPU for
 	 * many seconds.  To ensure that other threads that may be bound to
@@ -4582,7 +4621,16 @@ void
 arc_reduce_target_size(int64_t to_free)
 {
 	uint64_t asize = aggsum_value(&arc_size);
-	uint64_t c = arc_c;
+
+	/*
+	 * All callers want the ARC to actually evict (at least) this much
+	 * memory.  Therefore we reduce from the lower of the current size and
+	 * the target size.  This way, even if arc_c is much higher than
+	 * arc_size (as can be the case after many calls to arc_freed(), we will
+	 * immediately have arc_c < arc_size and therefore the arc_evict_zthr
+	 * will evict.
+	 */
+	uint64_t c = MIN(arc_c, asize);
 
 	if (c > to_free && c - to_free > arc_c_min) {
 		arc_c = c - to_free;
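For a concrete (hypothetical) illustration of this hunk: if arc_c has drifted up to
10 GiB while arc_size is only 4 GiB, a request to free 1 GiB previously produced
arc_c = 9 GiB, still far above arc_size, so nothing was actually evicted. Starting
from MIN(arc_c, asize) instead yields arc_c = 3 GiB, which is below arc_size, so
arc_evict_zthr begins evicting immediately.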
@@ -4693,18 +4741,18 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 
 	/*
-	 * We have to rely on arc_get_data_impl() to tell us when to evict,
-	 * rather than checking if we are overflowing here, so that we are
-	 * sure to not leave arc_get_data_impl() waiting on
-	 * arc_evict_waiters_cv.  If we have become "not overflowing" since
-	 * arc_get_data_impl() checked, we need to wake it up.  We could
-	 * broadcast the CV here, but arc_get_data_impl() may have not yet
-	 * gone to sleep.  We would need to use a mutex to ensure that this
-	 * function doesn't broadcast until arc_get_data_impl() has gone to
-	 * sleep (e.g. the arc_evict_lock).  However, the lock ordering of
-	 * such a lock would necessarily be incorrect with respect to the
-	 * zthr_lock, which is held before this function is called, and is
-	 * held by arc_get_data_impl() when it calls zthr_wakeup().
+	 * We have to rely on arc_wait_for_eviction() to tell us when to
+	 * evict, rather than checking if we are overflowing here, so that we
+	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+	 * If we have become "not overflowing" since arc_wait_for_eviction()
+	 * checked, we need to wake it up.  We could broadcast the CV here,
+	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
+	 * would need to use a mutex to ensure that this function doesn't
+	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+	 * the arc_evict_lock).  However, the lock ordering of such a lock
+	 * would necessarily be incorrect with respect to the zthr_lock,
+	 * which is held before this function is called, and is held by
+	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	return (arc_evict_needed);
 }
@@ -4743,8 +4791,11 @@ arc_evict_cb(void *arg, zthr_t *zthr)
 		 * can't evict anything more, so we should wake
 		 * arc_get_data_impl() sooner.
 		 */
-		cv_broadcast(&arc_evict_waiters_cv);
-		arc_need_free = 0;
+		arc_evict_waiter_t *aw;
+		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+			cv_broadcast(&aw->aew_cv);
+		}
+		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
@@ -4824,9 +4875,6 @@ arc_reap_cb(void *arg, zthr_t *zthr)
 	int64_t to_free =
 	    (arc_c >> arc_shrink_shift) - free_memory;
 	if (to_free > 0) {
-#ifdef _KERNEL
-		to_free = MAX(to_free, arc_need_free);
-#endif
 		arc_reduce_target_size(to_free);
 	}
 	spl_fstrans_unmark(cookie);
@@ -5008,6 +5056,64 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 }
 
 /*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system.  Waiting for
+ * eviction ensures that the memory used by the ARC decreases.  Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings.  See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+	mutex_enter(&arc_evict_lock);
+	if (arc_is_overflowing()) {
+		arc_evict_needed = B_TRUE;
+		zthr_wakeup(arc_evict_zthr);
+
+		if (amount != 0) {
+			arc_evict_waiter_t aw;
+			list_link_init(&aw.aew_node);
+			cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+			arc_evict_waiter_t *last =
+			    list_tail(&arc_evict_waiters);
+			if (last != NULL) {
+				ASSERT3U(last->aew_count, >, arc_evict_count);
+				aw.aew_count = last->aew_count + amount;
+			} else {
+				aw.aew_count = arc_evict_count + amount;
+			}
+
+			list_insert_tail(&arc_evict_waiters, &aw);
+
+			arc_set_need_free();
+
+			DTRACE_PROBE3(arc__wait__for__eviction,
+			    uint64_t, amount,
+			    uint64_t, arc_evict_count,
+			    uint64_t, aw.aew_count);
+
+			/*
+			 * We will be woken up either when arc_evict_count
+			 * reaches aew_count, or when the ARC is no longer
+			 * overflowing and eviction completes.
+			 */
+			cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+			/*
+			 * In case of "false" wakeup, we will still be on the
+			 * list.
+			 */
+			if (list_link_active(&aw.aew_node))
+				list_remove(&arc_evict_waiters, &aw);
+
+			cv_destroy(&aw.aew_cv);
+		}
+	}
+	mutex_exit(&arc_evict_lock);
+}
+
+/*
  * Allocate a block and return it to the caller.  If we are hitting the
  * hard limit for the cache size, we must sleep, waiting for the eviction
  * thread to catch up.  If we're past the target size but below the hard
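As used elsewhere in this patch, a non-zero amount (the Linux shrinker,
arc_get_data_impl(), and FreeBSD's arc_lowmem() when running in pageproc) blocks
until arc_evict_count has advanced by that many bytes or eviction completes, while
amount == 0 (FreeBSD's arc_lowmem() from other threads, where blocking could
deadlock) only marks eviction as needed and wakes arc_evict_zthr before returning.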
@@ -5022,40 +5128,26 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 	arc_adapt(size, state);
 
 	/*
-	 * If arc_size is currently overflowing, and has grown past our
-	 * upper limit, we must be adding data faster than the evict
-	 * thread can evict.  Thus, to ensure we don't compound the
+	 * If arc_size is currently overflowing, we must be adding data
+	 * faster than we are evicting.  To ensure we don't compound the
 	 * problem by adding more data and forcing arc_size to grow even
-	 * further past it's target size, we halt and wait for the
-	 * eviction thread to catch up.
+	 * further past it's target size, we wait for the eviction thread to
+	 * make some progress.  We also wait for there to be sufficient free
+	 * memory in the system, as measured by arc_free_memory().
+	 *
+	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
+	 * requested size to be evicted.  This should be more than 100%, to
+	 * ensure that that progress is also made towards getting arc_size
+	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 *
-	 * It's also possible that the reclaim thread is unable to evict
-	 * enough buffers to get arc_size below the overflow limit (e.g.
-	 * due to buffers being un-evictable, or hash lock collisions).
-	 * In this case, we want to proceed regardless if we're
-	 * overflowing; thus we don't use a while loop here.
+	 * We do the overflowing check without holding the arc_evict_lock to
+	 * reduce lock contention in this hot path.  Note that
+	 * arc_wait_for_eviction() will acquire the lock and check again to
+	 * ensure we are truly overflowing before blocking.
 	 */
 	if (arc_is_overflowing()) {
-		mutex_enter(&arc_evict_lock);
-
-		/*
-		 * Now that we've acquired the lock, we may no longer be
-		 * over the overflow limit, lets check.
-		 *
-		 * We're ignoring the case of spurious wake ups.  If that
-		 * were to happen, it'd let this thread consume an ARC
-		 * buffer before it should have (i.e. before we're under
-		 * the overflow limit and were signalled by the reclaim
-		 * thread).  As long as that is a rare occurrence, it
-		 * shouldn't cause any harm.
-		 */
-		if (arc_is_overflowing()) {
-			arc_evict_needed = B_TRUE;
-			zthr_wakeup(arc_evict_zthr);
-			(void) cv_wait(&arc_evict_waiters_cv,
-			    &arc_evict_lock);
-		}
-		mutex_exit(&arc_evict_lock);
+		arc_wait_for_eviction(size *
+		    zfs_arc_eviction_pct / 100);
 	}
 
 	VERIFY3U(hdr->b_type, ==, type);
@@ -7269,7 +7361,8 @@ arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL);
+	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+	    offsetof(arc_evict_waiter_t, aew_node));
 
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
@@ -7402,7 +7495,7 @@ arc_fini(void)
 	(void) zthr_cancel(arc_reap_zthr);
 
 	mutex_destroy(&arc_evict_lock);
-	cv_destroy(&arc_evict_waiters_cv);
+	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * buf_fini() must proceed arc_state_fini() because buf_fin() may
@@ -10357,4 +10450,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+	"When full, ARC allocation waits for eviction of this % of alloc size");
 /* END CSTYLED */