Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--   module/zfs/arc.c   252
1 file changed, 174 insertions, 78 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index dcf710ad1..3ec98917d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -313,17 +313,38 @@ boolean_t arc_watch = B_FALSE;
* calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
* arc_available_memory().
*/
-static zthr_t		*arc_reap_zthr;
+static zthr_t *arc_reap_zthr;
/*
* This thread's job is to keep arc_size under arc_c, by calling
* arc_evict(), which improves arc_is_overflowing().
*/
-zthr_t *arc_evict_zthr;
+static zthr_t *arc_evict_zthr;
-kmutex_t arc_evict_lock;
-kcondvar_t arc_evict_waiters_cv;
-boolean_t arc_evict_needed = B_FALSE;
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted. For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c. Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
/*
* The number of headers to evict in arc_evict_state_impl() before
@@ -632,6 +653,7 @@ arc_state_t *arc_mfu;
#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit)
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
/* size of all b_rabd's in entire arc */
#define arc_raw_size ARCSTAT(arcstat_raw_size)
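The waiter structure used throughout this change, arc_evict_waiter_t, is defined in the corresponding arc_impl.h hunk rather than in arc.c, so it never appears in this file's diff. A minimal sketch of what the code below requires of it; the three field names are taken from their uses here, while the exact header layout is an assumption:

typedef struct arc_evict_waiter {
	list_node_t	aew_node;	/* linkage on arc_evict_waiters */
	kcondvar_t	aew_cv;		/* broadcast when the wait is satisfied */
	uint64_t	aew_count;	/* arc_evict_count value to wait for */
} arc_evict_waiter_t;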
@@ -3859,6 +3881,20 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
return (bytes_evicted);
}
+static void
+arc_set_need_free(void)
+{
+ ASSERT(MUTEX_HELD(&arc_evict_lock));
+ int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+ arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+ if (aw == NULL) {
+ arc_need_free = MAX(-remaining, 0);
+ } else {
+ arc_need_free =
+ MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+ }
+}
+
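In words: arc_need_free becomes the larger of the system's free-memory deficit and the bytes still owed to the most recently queued waiter. A hypothetical reading of the arithmetic, with made-up numbers:

/*
 * Suppose arc_free_memory() = 100 MB and arc_sys_free / 2 = 128 MB,
 * so remaining = -28 MB, and the tail waiter is still owed
 * aew_count - arc_evict_count = 10 MB.  Then
 *	arc_need_free = MAX(28 MB, 10 MB) = 28 MB.
 * With ample free memory, remaining is positive, -remaining is
 * negative, and arc_need_free reduces to the tail waiter's deficit
 * (or 0 when no waiters are queued).
 */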
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes)
@@ -3938,29 +3974,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
if (evicted != 0)
evict_count++;
- /*
- * If arc_size isn't overflowing, signal any
- * threads that might happen to be waiting.
- *
- * For each header evicted, we wake up a single
- * thread. If we used cv_broadcast, we could
- * wake up "too many" threads causing arc_size
- * to significantly overflow arc_c; since
- * arc_get_data_impl() doesn't check for overflow
- * when it's woken up (it doesn't because it's
- * possible for the ARC to be overflowing while
- * full of un-evictable buffers, and the
- * function should proceed in this case).
- *
- * If threads are left sleeping, due to not
- * using cv_broadcast here, they will be woken
- * up via cv_broadcast in arc_evict_cb() just
- * before arc_evict_zthr sleeps.
- */
- mutex_enter(&arc_evict_lock);
- if (!arc_is_overflowing())
- cv_signal(&arc_evict_waiters_cv);
- mutex_exit(&arc_evict_lock);
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
}
@@ -3969,6 +3982,32 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
multilist_sublist_unlock(mls);
/*
+ * Increment the count of evicted bytes, and wake up any threads that
+ * are waiting for the count to reach this value. Since the list is
+ * ordered by ascending aew_count, we pop off the beginning of the
+ * list until we reach the end, or a waiter that's past the current
+ * "count". Doing this outside the loop reduces the number of times
+ * we need to acquire the global arc_evict_lock.
+ *
+ * Only wake when there's sufficient free memory in the system
+ * (specifically, arc_sys_free/2, which by default is a bit more than
+ * 1/64th of RAM). See the comments in arc_wait_for_eviction().
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_count += bytes_evicted;
+
+ if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) {
+ arc_evict_waiter_t *aw;
+ while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+ aw->aew_count <= arc_evict_count) {
+ list_remove(&arc_evict_waiters, aw);
+ cv_broadcast(&aw->aew_cv);
+ }
+ }
+ arc_set_need_free();
+ mutex_exit(&arc_evict_lock);
+
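The early exit of this while loop is sound only because arc_evict_waiters stays sorted, which follows from how waiters are enqueued in arc_wait_for_eviction() further down. The invariant, spelled out:

/*
 * Each new waiter is appended with
 *	aew_count = (tail ? tail->aew_count : arc_evict_count) + amount,
 * so the list is always sorted by ascending aew_count, and the scan
 * can stop at the first waiter whose aew_count exceeds the current
 * arc_evict_count.
 */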
+ /*
* If the ARC size is reduced from arc_c_max to arc_c_min (especially
* if the average cached block is small), eviction can be on-CPU for
* many seconds. To ensure that other threads that may be bound to
@@ -4582,7 +4621,16 @@ void
arc_reduce_target_size(int64_t to_free)
{
uint64_t asize = aggsum_value(&arc_size);
- uint64_t c = arc_c;
+
+ /*
+ * All callers want the ARC to actually evict (at least) this much
+ * memory. Therefore we reduce from the lower of the current size and
+ * the target size. This way, even if arc_c is much higher than
+ * arc_size (as can be the case after many calls to arc_freed()), we will
+ * immediately have arc_c < arc_size and therefore the arc_evict_zthr
+ * will evict.
+ */
+ uint64_t c = MIN(arc_c, asize);
if (c > to_free && c - to_free > arc_c_min) {
arc_c = c - to_free;
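A hypothetical illustration of why reducing from MIN(arc_c, asize) matters, assuming arc_c_min is below 1 GB:

/*
 * After many arc_freed() calls, arc_c = 8 GB while arc_size = 2 GB.
 * Subtracting 1 GB from arc_c alone would leave arc_c = 7 GB, still
 * far above arc_size, and arc_evict_zthr would have nothing to do.
 * With c = MIN(arc_c, asize) = 2 GB, arc_c drops to 1 GB, so
 * arc_size > arc_c holds immediately and eviction actually proceeds.
 */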
@@ -4693,18 +4741,18 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
/*
- * We have to rely on arc_get_data_impl() to tell us when to evict,
- * rather than checking if we are overflowing here, so that we are
- * sure to not leave arc_get_data_impl() waiting on
- * arc_evict_waiters_cv. If we have become "not overflowing" since
- * arc_get_data_impl() checked, we need to wake it up. We could
- * broadcast the CV here, but arc_get_data_impl() may have not yet
- * gone to sleep. We would need to use a mutex to ensure that this
- * function doesn't broadcast until arc_get_data_impl() has gone to
- * sleep (e.g. the arc_evict_lock). However, the lock ordering of
- * such a lock would necessarily be incorrect with respect to the
- * zthr_lock, which is held before this function is called, and is
- * held by arc_get_data_impl() when it calls zthr_wakeup().
+ * We have to rely on arc_wait_for_eviction() to tell us when to
+ * evict, rather than checking if we are overflowing here, so that we
+ * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+ * If we have become "not overflowing" since arc_wait_for_eviction()
+ * checked, we need to wake it up. We could broadcast the CV here,
+ * but arc_wait_for_eviction() may have not yet gone to sleep. We
+ * would need to use a mutex to ensure that this function doesn't
+ * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+ * the arc_evict_lock). However, the lock ordering of such a lock
+ * would necessarily be incorrect with respect to the zthr_lock,
+ * which is held before this function is called, and is held by
+ * arc_wait_for_eviction() when it calls zthr_wakeup().
*/
return (arc_evict_needed);
}
@@ -4743,8 +4791,11 @@ arc_evict_cb(void *arg, zthr_t *zthr)
* can't evict anything more, so we should wake
* arc_get_data_impl() sooner.
*/
- cv_broadcast(&arc_evict_waiters_cv);
- arc_need_free = 0;
+ arc_evict_waiter_t *aw;
+ while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+ cv_broadcast(&aw->aew_cv);
+ }
+ arc_set_need_free();
}
mutex_exit(&arc_evict_lock);
spl_fstrans_unmark(cookie);
@@ -4824,9 +4875,6 @@ arc_reap_cb(void *arg, zthr_t *zthr)
int64_t to_free =
(arc_c >> arc_shrink_shift) - free_memory;
if (to_free > 0) {
-#ifdef _KERNEL
- to_free = MAX(to_free, arc_need_free);
-#endif
arc_reduce_target_size(to_free);
}
spl_fstrans_unmark(cookie);
@@ -5008,6 +5056,64 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
}
/*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system. Waiting for
+ * eviction ensures that the memory used by the ARC decreases. Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings. See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+ mutex_enter(&arc_evict_lock);
+ if (arc_is_overflowing()) {
+ arc_evict_needed = B_TRUE;
+ zthr_wakeup(arc_evict_zthr);
+
+ if (amount != 0) {
+ arc_evict_waiter_t aw;
+ list_link_init(&aw.aew_node);
+ cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+ arc_evict_waiter_t *last =
+ list_tail(&arc_evict_waiters);
+ if (last != NULL) {
+ ASSERT3U(last->aew_count, >, arc_evict_count);
+ aw.aew_count = last->aew_count + amount;
+ } else {
+ aw.aew_count = arc_evict_count + amount;
+ }
+
+ list_insert_tail(&arc_evict_waiters, &aw);
+
+ arc_set_need_free();
+
+ DTRACE_PROBE3(arc__wait__for__eviction,
+ uint64_t, amount,
+ uint64_t, arc_evict_count,
+ uint64_t, aw.aew_count);
+
+ /*
+ * We will be woken up either when arc_evict_count
+ * reaches aew_count, or when the ARC is no longer
+ * overflowing and eviction completes.
+ */
+ cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+ /*
+ * In case of "false" wakeup, we will still be on the
+ * list.
+ */
+ if (list_link_active(&aw.aew_node))
+ list_remove(&arc_evict_waiters, &aw);
+
+ cv_destroy(&aw.aew_cv);
+ }
+ }
+ mutex_exit(&arc_evict_lock);
+}
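A condensed sketch of the intended call pattern; arc_get_data_impl() below is the real caller, and this merely restates the contract:

	if (arc_is_overflowing())
		arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);

Passing amount == 0 still sets arc_evict_needed and wakes arc_evict_zthr, but returns without sleeping, so a caller can nudge eviction without blocking.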
+
+/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
@@ -5022,40 +5128,26 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
arc_adapt(size, state);
/*
- * If arc_size is currently overflowing, and has grown past our
- * upper limit, we must be adding data faster than the evict
- * thread can evict. Thus, to ensure we don't compound the
+ * If arc_size is currently overflowing, we must be adding data
+ * faster than we are evicting. To ensure we don't compound the
* problem by adding more data and forcing arc_size to grow even
- * further past it's target size, we halt and wait for the
- * eviction thread to catch up.
+ * further past its target size, we wait for the eviction thread to
+ * make some progress. We also wait for there to be sufficient free
+ * memory in the system, as measured by arc_free_memory().
+ *
+ * Specifically, we wait for zfs_arc_eviction_pct percent of the
+ * requested size to be evicted. This should be more than 100%, to
+ * ensure that progress is also made towards getting arc_size
+ * under arc_c. See the comment above zfs_arc_eviction_pct.
*
- * It's also possible that the reclaim thread is unable to evict
- * enough buffers to get arc_size below the overflow limit (e.g.
- * due to buffers being un-evictable, or hash lock collisions).
- * In this case, we want to proceed regardless if we're
- * overflowing; thus we don't use a while loop here.
+ * We do the overflowing check without holding the arc_evict_lock to
+ * reduce lock contention in this hot path. Note that
+ * arc_wait_for_eviction() will acquire the lock and check again to
+ * ensure we are truly overflowing before blocking.
*/
if (arc_is_overflowing()) {
- mutex_enter(&arc_evict_lock);
-
- /*
- * Now that we've acquired the lock, we may no longer be
- * over the overflow limit, lets check.
- *
- * We're ignoring the case of spurious wake ups. If that
- * were to happen, it'd let this thread consume an ARC
- * buffer before it should have (i.e. before we're under
- * the overflow limit and were signalled by the reclaim
- * thread). As long as that is a rare occurrence, it
- * shouldn't cause any harm.
- */
- if (arc_is_overflowing()) {
- arc_evict_needed = B_TRUE;
- zthr_wakeup(arc_evict_zthr);
- (void) cv_wait(&arc_evict_waiters_cv,
- &arc_evict_lock);
- }
- mutex_exit(&arc_evict_lock);
+ arc_wait_for_eviction(size *
+ zfs_arc_eviction_pct / 100);
}
VERIFY3U(hdr->b_type, ==, type);
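To make the percentage concrete, a hypothetical allocation under the default zfs_arc_eviction_pct = 200:

/*
 * An overflowing 8 KB allocation waits for 8192 * 200 / 100 = 16 KB
 * to be evicted.  The allocation then consumes 8 KB of that headroom,
 * so each satisfied wait shrinks arc_size by a net 8 KB (ignoring
 * concurrent allocations), guaranteeing forward progress toward arc_c.
 */

On Linux builds the value can be tuned at runtime through the module parameter registered at the bottom of this diff, exposed at /sys/module/zfs/parameters/zfs_arc_eviction_pct.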
@@ -7269,7 +7361,8 @@ arc_init(void)
{
uint64_t percent, allmem = arc_all_memory();
mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL);
+ list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+ offsetof(arc_evict_waiter_t, aew_node));
arc_min_prefetch_ms = 1000;
arc_min_prescient_prefetch_ms = 6000;
@@ -7402,7 +7495,7 @@ arc_fini(void)
(void) zthr_cancel(arc_reap_zthr);
mutex_destroy(&arc_evict_lock);
- cv_destroy(&arc_evict_waiters_cv);
+ list_destroy(&arc_evict_waiters);
/*
* buf_fini() must precede arc_state_fini() because buf_fini() may
@@ -10357,4 +10450,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
"Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+ "When full, ARC allocation waits for eviction of this % of alloc size");
/* END CSTYLED */