Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--  module/zfs/arc.c  252
1 file changed, 174 insertions(+), 78 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index dcf710ad1..3ec98917d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -313,17 +313,38 @@ boolean_t arc_watch = B_FALSE;
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
-static zthr_t		*arc_reap_zthr;
+static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
-zthr_t		*arc_evict_zthr;
+static zthr_t *arc_evict_zthr;
 
-kmutex_t	arc_evict_lock;
-kcondvar_t	arc_evict_waiters_cv;
-boolean_t	arc_evict_needed = B_FALSE;
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted.  For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c.  Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
@@ -632,6 +653,7 @@ arc_state_t *arc_mfu;
 #define	arc_dnode_size_limit	ARCSTAT(arcstat_dnode_limit)
 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 /* size of all b_rabd's in entire arc */
 #define	arc_raw_size	ARCSTAT(arcstat_raw_size)
@@ -3859,6 +3881,20 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 	return (bytes_evicted);
 }
 
+static void
+arc_set_need_free(void)
+{
+	ASSERT(MUTEX_HELD(&arc_evict_lock));
+	int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+	arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+	if (aw == NULL) {
+		arc_need_free = MAX(-remaining, 0);
+	} else {
+		arc_need_free =
+		    MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+	}
+}
+
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, int64_t bytes)
@@ -3938,29 +3974,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			if (evicted != 0)
 				evict_count++;
 
-			/*
-			 * If arc_size isn't overflowing, signal any
-			 * threads that might happen to be waiting.
-			 *
-			 * For each header evicted, we wake up a single
-			 * thread.  If we used cv_broadcast, we could
-			 * wake up "too many" threads causing arc_size
-			 * to significantly overflow arc_c; since
-			 * arc_get_data_impl() doesn't check for overflow
-			 * when it's woken up (it doesn't because it's
-			 * possible for the ARC to be overflowing while
-			 * full of un-evictable buffers, and the
-			 * function should proceed in this case).
-			 *
-			 * If threads are left sleeping, due to not
-			 * using cv_broadcast here, they will be woken
-			 * up via cv_broadcast in arc_evict_cb() just
-			 * before arc_evict_zthr sleeps.
-			 */
-			mutex_enter(&arc_evict_lock);
-			if (!arc_is_overflowing())
-				cv_signal(&arc_evict_waiters_cv);
-			mutex_exit(&arc_evict_lock);
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
@@ -3969,6 +3982,32 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	multilist_sublist_unlock(mls);
 
 	/*
+	 * Increment the count of evicted bytes, and wake up any threads that
+	 * are waiting for the count to reach this value.  Since the list is
+	 * ordered by ascending aew_count, we pop off the beginning of the
+	 * list until we reach the end, or a waiter that's past the current
+	 * "count".  Doing this outside the loop reduces the number of times
+	 * we need to acquire the global arc_evict_lock.
+	 *
+	 * Only wake when there's sufficient free memory in the system
+	 * (specifically, arc_sys_free/2, which by default is a bit more than
+	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
+	 */
+	mutex_enter(&arc_evict_lock);
+	arc_evict_count += bytes_evicted;
+
+	if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) {
+		arc_evict_waiter_t *aw;
+		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+		    aw->aew_count <= arc_evict_count) {
+			list_remove(&arc_evict_waiters, aw);
+			cv_broadcast(&aw->aew_cv);
+		}
+	}
+	arc_set_need_free();
+	mutex_exit(&arc_evict_lock);
+
+	/*
 	 * If the ARC size is reduced from arc_c_max to arc_c_min (especially
 	 * if the average cached block is small), eviction can be on-CPU for
 	 * many seconds.  To ensure that other threads that may be bound to
@@ -4582,7 +4621,16 @@ void
 arc_reduce_target_size(int64_t to_free)
 {
 	uint64_t asize = aggsum_value(&arc_size);
-	uint64_t c = arc_c;
+
+	/*
+	 * All callers want the ARC to actually evict (at least) this much
+	 * memory.  Therefore we reduce from the lower of the current size and
+	 * the target size.  This way, even if arc_c is much higher than
+	 * arc_size (as can be the case after many calls to arc_freed()), we
+	 * will immediately have arc_c < arc_size and therefore the
+	 * arc_evict_zthr will evict.
+	 */
+	uint64_t c = MIN(arc_c, asize);
 
 	if (c > to_free && c - to_free > arc_c_min) {
 		arc_c = c - to_free;
@@ -4693,18 +4741,18 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 
 	/*
-	 * We have to rely on arc_get_data_impl() to tell us when to evict,
-	 * rather than checking if we are overflowing here, so that we are
-	 * sure to not leave arc_get_data_impl() waiting on
-	 * arc_evict_waiters_cv.  If we have become "not overflowing" since
-	 * arc_get_data_impl() checked, we need to wake it up.  We could
-	 * broadcast the CV here, but arc_get_data_impl() may have not yet
-	 * gone to sleep.  We would need to use a mutex to ensure that this
-	 * function doesn't broadcast until arc_get_data_impl() has gone to
-	 * sleep (e.g. the arc_evict_lock).  However, the lock ordering of
-	 * such a lock would necessarily be incorrect with respect to the
-	 * zthr_lock, which is held before this function is called, and is
-	 * held by arc_get_data_impl() when it calls zthr_wakeup().
+	 * We have to rely on arc_wait_for_eviction() to tell us when to
+	 * evict, rather than checking if we are overflowing here, so that we
+	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+	 * If we have become "not overflowing" since arc_wait_for_eviction()
+	 * checked, we need to wake it up.  We could broadcast the CV here,
+	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
+	 * would need to use a mutex to ensure that this function doesn't
+	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+	 * the arc_evict_lock).  However, the lock ordering of such a lock
+	 * would necessarily be incorrect with respect to the zthr_lock,
+	 * which is held before this function is called, and is held by
+	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	return (arc_evict_needed);
 }
@@ -4743,8 +4791,11 @@ arc_evict_cb(void *arg, zthr_t *zthr)
 		 * can't evict anything more, so we should wake
 		 * arc_get_data_impl() sooner.
 		 */
-		cv_broadcast(&arc_evict_waiters_cv);
-		arc_need_free = 0;
+		arc_evict_waiter_t *aw;
+		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+			cv_broadcast(&aw->aew_cv);
+		}
+		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
@@ -4824,9 +4875,6 @@ arc_reap_cb(void *arg, zthr_t *zthr)
 	int64_t to_free =
 	    (arc_c >> arc_shrink_shift) - free_memory;
 	if (to_free > 0) {
-#ifdef _KERNEL
-		to_free = MAX(to_free, arc_need_free);
-#endif
 		arc_reduce_target_size(to_free);
 	}
 	spl_fstrans_unmark(cookie);
@@ -5008,6 +5056,64 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 }
 
 /*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system.  Waiting for
+ * eviction ensures that the memory used by the ARC decreases.  Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings.  See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+	mutex_enter(&arc_evict_lock);
+	if (arc_is_overflowing()) {
+		arc_evict_needed = B_TRUE;
+		zthr_wakeup(arc_evict_zthr);
+
+		if (amount != 0) {
+			arc_evict_waiter_t aw;
+			list_link_init(&aw.aew_node);
+			cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+			arc_evict_waiter_t *last =
+			    list_tail(&arc_evict_waiters);
+			if (last != NULL) {
+				ASSERT3U(last->aew_count, >, arc_evict_count);
+				aw.aew_count = last->aew_count + amount;
+			} else {
+				aw.aew_count = arc_evict_count + amount;
+			}
+
+			list_insert_tail(&arc_evict_waiters, &aw);
+
+			arc_set_need_free();
+
+			DTRACE_PROBE3(arc__wait__for__eviction,
+			    uint64_t, amount,
+			    uint64_t, arc_evict_count,
+			    uint64_t, aw.aew_count);
+
+			/*
+			 * We will be woken up either when arc_evict_count
+			 * reaches aew_count, or when the ARC is no longer
+			 * overflowing and eviction completes.
+			 */
+			cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+			/*
+			 * In case of "false" wakeup, we will still be on the
+			 * list.
+			 */
+			if (list_link_active(&aw.aew_node))
+				list_remove(&arc_evict_waiters, &aw);
+
+			cv_destroy(&aw.aew_cv);
+		}
+	}
+	mutex_exit(&arc_evict_lock);
+}
+
+/*
  * Allocate a block and return it to the caller.  If we are hitting the
  * hard limit for the cache size, we must sleep, waiting for the eviction
  * thread to catch up.  If we're past the target size but below the hard
@@ -5022,40 +5128,26 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 	arc_adapt(size, state);
 
 	/*
-	 * If arc_size is currently overflowing, and has grown past our
-	 * upper limit, we must be adding data faster than the evict
-	 * thread can evict.  Thus, to ensure we don't compound the
+	 * If arc_size is currently overflowing, we must be adding data
+	 * faster than we are evicting.  To ensure we don't compound the
 	 * problem by adding more data and forcing arc_size to grow even
-	 * further past it's target size, we halt and wait for the
-	 * eviction thread to catch up.
+	 * further past its target size, we wait for the eviction thread to
+	 * make some progress.  We also wait for there to be sufficient free
+	 * memory in the system, as measured by arc_free_memory().
+	 *
+	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
+	 * requested size to be evicted.  This should be more than 100%, to
+	 * ensure that progress is also made towards getting arc_size
+	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 *
-	 * It's also possible that the reclaim thread is unable to evict
-	 * enough buffers to get arc_size below the overflow limit (e.g.
-	 * due to buffers being un-evictable, or hash lock collisions).
-	 * In this case, we want to proceed regardless if we're
-	 * overflowing; thus we don't use a while loop here.
+	 * We do the overflowing check without holding the arc_evict_lock to
+	 * reduce lock contention in this hot path.  Note that
+	 * arc_wait_for_eviction() will acquire the lock and check again to
+	 * ensure we are truly overflowing before blocking.
 	 */
 	if (arc_is_overflowing()) {
-		mutex_enter(&arc_evict_lock);
-
-		/*
-		 * Now that we've acquired the lock, we may no longer be
-		 * over the overflow limit, lets check.
-		 *
-		 * We're ignoring the case of spurious wake ups.  If that
-		 * were to happen, it'd let this thread consume an ARC
-		 * buffer before it should have (i.e. before we're under
-		 * the overflow limit and were signalled by the reclaim
-		 * thread).  As long as that is a rare occurrence, it
-		 * shouldn't cause any harm.
-		 */
-		if (arc_is_overflowing()) {
-			arc_evict_needed = B_TRUE;
-			zthr_wakeup(arc_evict_zthr);
-			(void) cv_wait(&arc_evict_waiters_cv,
-			    &arc_evict_lock);
-		}
-		mutex_exit(&arc_evict_lock);
+		arc_wait_for_eviction(size *
+		    zfs_arc_eviction_pct / 100);
	}
 
 	VERIFY3U(hdr->b_type, ==, type);
@@ -7269,7 +7361,8 @@ arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL);
+	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+	    offsetof(arc_evict_waiter_t, aew_node));
 
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
@@ -7402,7 +7495,7 @@ arc_fini(void)
 	(void) zthr_cancel(arc_reap_zthr);
 
 	mutex_destroy(&arc_evict_lock);
-	cv_destroy(&arc_evict_waiters_cv);
+	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * buf_fini() must precede arc_state_fini() because buf_fini() may
@@ -10357,4 +10450,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+	"When full, ARC allocation waits for eviction of this % of alloc size");
 /* END CSTYLED */
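The diff references arc_evict_waiter_t without showing its definition, which lives in the commit's header changes rather than in arc.c. Inferred from the fields used above (aew_node, aew_cv, aew_count), the structure presumably looks like this:

/* Presumed shape, inferred from the fields this diff uses. */
typedef struct arc_evict_waiter {
	list_node_t aew_node;	/* linkage on the arc_evict_waiters list */
	kcondvar_t aew_cv;	/* signalled when aew_count is reached */
	uint64_t aew_count;	/* arc_evict_count value this thread awaits */
} arc_evict_waiter_t;

Giving each waiter its own condition variable is what lets arc_evict_state_impl() wake exactly the threads whose targets have been satisfied, instead of cv_signal'ing one shared CV once per evicted header as the removed code did.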
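To make the back-pressure arithmetic concrete, here is a minimal stand-alone sketch of the target computation performed by arc_get_data_impl() and arc_wait_for_eviction(); the starting count and allocation sizes are illustrative, not taken from the diff:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int eviction_pct = 200;		/* zfs_arc_eviction_pct default */
	uint64_t evict_count = 0;	/* arc_evict_count at entry */
	uint64_t allocs[] = { 8192, 4096 };	/* two overflowing allocations */
	uint64_t last_target = evict_count;

	for (int i = 0; i < 2; i++) {
		/* arc_get_data_impl(): scale the request by the tunable. */
		uint64_t amount = allocs[i] * eviction_pct / 100;
		/* arc_wait_for_eviction(): queue behind the last waiter. */
		last_target += amount;
		printf("alloc of %llu waits until arc_evict_count >= %llu\n",
		    (unsigned long long)allocs[i],
		    (unsigned long long)last_target);
	}
	return (0);
}

With the default of 200%, an 8 KB allocation must see 16 KB evicted before it proceeds, and because each new waiter queues behind the previous waiter's target (last->aew_count + amount), concurrent allocators cannot collectively outrun eviction.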
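The waiter protocol itself is easy to model outside the kernel. Below is a rough user-space analogue (pthreads and <sys/queue.h> stand in for the SPL primitives; none of this is ZFS code) showing both halves: the allocator queuing a cumulative target, and the evictor bumping one global counter per batch and popping satisfied waiters off the head of the ordered list.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/queue.h>
#include <unistd.h>

struct waiter {
	TAILQ_ENTRY(waiter) node;	/* like aew_node */
	pthread_cond_t cv;		/* like aew_cv */
	uint64_t count;			/* like aew_count */
};
TAILQ_HEAD(waiter_list, waiter);

static pthread_mutex_t evict_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter_list waiters = TAILQ_HEAD_INITIALIZER(waiters);
static uint64_t evict_count;		/* like arc_evict_count */

/* Allocator side: take a target past the last waiter and sleep on it. */
static void
wait_for_eviction(uint64_t amount)
{
	struct waiter w;
	pthread_cond_init(&w.cv, NULL);
	pthread_mutex_lock(&evict_lock);
	struct waiter *last = TAILQ_LAST(&waiters, waiter_list);
	w.count = (last != NULL ? last->count : evict_count) + amount;
	TAILQ_INSERT_TAIL(&waiters, &w, node);
	while (evict_count < w.count)
		pthread_cond_wait(&w.cv, &evict_lock);
	pthread_mutex_unlock(&evict_lock);
	pthread_cond_destroy(&w.cv);
}

/* Evictor side: bump the count once per batch, pop satisfied waiters. */
static void
note_evicted(uint64_t bytes)
{
	struct waiter *w;
	pthread_mutex_lock(&evict_lock);
	evict_count += bytes;
	while ((w = TAILQ_FIRST(&waiters)) != NULL && w->count <= evict_count) {
		TAILQ_REMOVE(&waiters, w, node);
		pthread_cond_broadcast(&w->cv);
	}
	pthread_mutex_unlock(&evict_lock);
}

static void *
allocator(void *arg)
{
	(void)arg;
	wait_for_eviction(16384);	/* e.g. an 8 KiB request at 200% */
	puts("allocator: target satisfied");
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, allocator, NULL);

	/* Wait until the allocator has queued itself. */
	for (;;) {
		pthread_mutex_lock(&evict_lock);
		int queued = !TAILQ_EMPTY(&waiters);
		pthread_mutex_unlock(&evict_lock);
		if (queued)
			break;
		usleep(1000);
	}

	/* "Evict" in 4 KiB batches until the waiter list drains. */
	for (;;) {
		note_evicted(4096);
		pthread_mutex_lock(&evict_lock);
		int drained = TAILQ_EMPTY(&waiters);
		pthread_mutex_unlock(&evict_lock);
		if (drained)
			break;
	}
	pthread_join(t, NULL);
	return (0);
}

One deliberate difference from this sketch: the kernel code calls cv_wait() only once rather than re-checking in a loop, because arc_evict_cb() also broadcasts to release waiters when nothing more can be evicted; that is why arc_wait_for_eviction() removes itself from the list after waking if it is still linked.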