 include/sys/zfs_context.h |   5
 lib/libzpool/kernel.c     |   1
 module/zfs/arc.c          | 525
 3 files changed, 374 insertions(+), 157 deletions(-)
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index acd370b23..7652a9cae 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -410,9 +410,12 @@ extern void kstat_set_raw_ops(kstat_t *ksp,
#define kmem_debugging() 0
#define kmem_cache_reap_now(_c) umem_cache_reap_now(_c);
#define kmem_cache_set_move(_c, _cb) /* nothing */
+#define vmem_qcache_reap(_v) /* nothing */
#define POINTER_INVALIDATE(_pp) /* nothing */
#define POINTER_IS_VALID(_p) 0
+extern vmem_t *zio_arena;
+
typedef umem_cache_t kmem_cache_t;
typedef enum kmem_cbrc {
@@ -610,7 +613,7 @@ extern void delay(clock_t ticks);
} while (0);
#define max_ncpus 64
-#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN))
+#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN))
#define minclsyspri 60
#define maxclsyspri 99
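
The boot_ncpus definition above gives the userland build (libzpool, and consumers such as ztest and zdb) the same symbol the kernel exposes; the replaced num_online_cpus() macro expanded to the identical sysconf() call. A minimal standalone sketch of what that expansion evaluates to at runtime:

/* Standalone sketch: what the userland boot_ncpus macro evaluates to. */
#include <stdio.h>
#include <unistd.h>

#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN))

int
main(void)
{
	printf("boot_ncpus = %ld\n", (long)boot_ncpus);
	return (0);
}
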
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 85fe24afd..80da41151 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -48,6 +48,7 @@ uint64_t physmem;
vnode_t *rootdir = (vnode_t *)0xabcd1234;
char hw_serial[HW_HOSTID_LEN];
struct utsname hw_utsname;
+vmem_t *zio_arena = NULL;
/* this only exists to have its address taken */
struct proc p0;
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index df7845b2f..d734a170f 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -164,17 +164,6 @@ static kmutex_t arc_user_evicts_lock;
static kcondvar_t arc_user_evicts_cv;
static boolean_t arc_user_evicts_thread_exit;
-/* number of objects to prune from caches when arc_meta_limit is reached */
-int zfs_arc_meta_prune = 10000;
-
-/* The preferred strategy to employ when arc_meta_limit is reached */
-int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
-
-typedef enum arc_reclaim_strategy {
- ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
- ARC_RECLAIM_CONS /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
-
/*
* The number of headers to evict in arc_evict_state_impl() before
* dropping the sublist lock and evicting from another sublist. A lower
@@ -192,40 +181,31 @@ int zfs_arc_evict_batch_limit = 10;
int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
-int zfs_arc_grow_retry = 5;
+static int arc_grow_retry = 5;
/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
-int zfs_arc_overflow_shift = 8;
-
-/* disable anon data aggressively growing arc_p */
-int zfs_arc_p_aggressive_disable = 1;
-
-/* disable arc_p adapt dampener in arc_adapt */
-int zfs_arc_p_dampener_disable = 1;
+int zfs_arc_overflow_shift = 8;
/* log2(fraction of arc to reclaim) */
-int zfs_arc_shrink_shift = 5;
+static int arc_shrink_shift = 7;
/*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
*/
-int zfs_arc_min_prefetch_lifespan = HZ;
+int arc_no_grow_shift = 5;
-/* disable arc proactive arc throttle due to low memory */
-int zfs_arc_memory_throttle_disable = 1;
-
-/* disable duplicate buffer eviction */
-int zfs_disable_dup_eviction = 0;
-
-/* average block used to size buf_hash_table */
-int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+static int arc_min_prefetch_lifespan;
/*
* If this percent of memory is free, don't throttle.
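
The new arc_no_grow_shift works together with arc_shrink_shift: growth is blocked once free memory drops below arc_c >> arc_no_grow_shift, while a shrink aims to leave roughly arc_c >> arc_shrink_shift free. A small standalone sketch of the two thresholds, using an assumed 4 GiB arc_c (the value is illustrative only):

/* Standalone sketch of the grow/shrink thresholds implied by the shifts;
 * the 4 GiB arc_c is an assumed example value. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_c = 4ULL << 30;
	int arc_no_grow_shift = 5;
	int arc_shrink_shift = 7;

	uint64_t no_grow_below = arc_c >> arc_no_grow_shift;	/* 128 MiB */
	uint64_t shrink_target = arc_c >> arc_shrink_shift;	/*  32 MiB */

	/*
	 * Because arc_no_grow_shift < arc_shrink_shift, a shrink leaves less
	 * free memory than the no-grow threshold, so the ARC does not turn
	 * around and grow again immediately after shrinking.
	 */
	printf("no-grow below %llu MiB, shrink leaves ~%llu MiB free\n",
	    (unsigned long long)(no_grow_below >> 20),
	    (unsigned long long)(shrink_target >> 20));
	return (0);
}
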
@@ -234,9 +214,6 @@ int arc_lotsfree_percent = 10;
static int arc_dead;
-/* expiration time for arc_no_grow */
-static clock_t arc_grow_time = 0;
-
/*
* The arc has filled available memory and has now warmed up.
*/
@@ -249,11 +226,21 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
unsigned long zfs_arc_meta_min = 0;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_disable_dup_eviction = 0;
+int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
/*
- * Limit the number of restarts in arc_adjust_meta()
+ * These tunables are Linux specific
*/
-unsigned long zfs_arc_meta_adjust_restarts = 4096;
+int zfs_arc_memory_throttle_disable = 1;
+int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_p_aggressive_disable = 1;
+int zfs_arc_p_dampener_disable = 1;
+int zfs_arc_meta_prune = 10000;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
/* The 6 states: */
static arc_state_t ARC_anon;
@@ -689,6 +676,7 @@ static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
+static void arc_tuning_update(void);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -2535,7 +2523,7 @@ arc_prune_task(void *ptr)
* is analogous to dnlc_reduce_cache() but more generic.
*
* This operation is performed asynchronously so it may be safely called
- * in the context of the arc_adapt_thread(). A reference is taken here
+ * in the context of the arc_reclaim_thread(). A reference is taken here
* for each registered arc_prune_t and the arc_prune_task() is responsible
* for releasing it once the registered arc_prune_func_t has completed.
*/
@@ -2611,7 +2599,7 @@ arc_adjust_meta_balanced(void)
int64_t adjustmnt, delta, prune = 0;
uint64_t total_evicted = 0;
arc_buf_contents_t type = ARC_BUFC_DATA;
- unsigned long restarts = zfs_arc_meta_adjust_restarts;
+ int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
restart:
/*
@@ -3004,25 +2992,16 @@ arc_flush(spa_t *spa, boolean_t retry)
}
void
-arc_shrink(uint64_t bytes)
+arc_shrink(int64_t to_free)
{
if (arc_c > arc_c_min) {
- uint64_t to_free;
-
- to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_c = arc_c_min;
- to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
-
- if (arc_p > to_free)
- atomic_add_64(&arc_p, -to_free);
- else
- arc_p = 0;
-
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
if (arc_c > arc_size)
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
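
With the strategy argument gone, arc_shrink() now simply lowers arc_c by an explicit byte count and re-clamps arc_p. A minimal userland model of that clamping, with plain variables standing in for the real atomically updated globals and made-up sizes:

/* Userland model of the new arc_shrink(to_free) clamping; plain uint64_t
 * variables stand in for the atomically updated ARC globals. */
#include <stdio.h>
#include <stdint.h>

static uint64_t arc_c = 3ULL << 30;		/* assumed target: 3 GiB */
static uint64_t arc_c_min = 1ULL << 30;		/* assumed floor: 1 GiB */
static uint64_t arc_p = 1536ULL << 20;
static uint64_t arc_size = 2ULL << 30;
static int arc_shrink_shift = 7;

static void
model_arc_shrink(uint64_t to_free)
{
	if (arc_c > arc_c_min) {
		if (arc_c > arc_c_min + to_free)
			arc_c -= to_free;
		else
			arc_c = arc_c_min;
		arc_p -= arc_p >> arc_shrink_shift;
		if (arc_c > arc_size)
			arc_c = (arc_size > arc_c_min) ? arc_size : arc_c_min;
		if (arc_p > arc_c)
			arc_p = arc_c >> 1;
	}
	/* the real function then calls arc_adjust() when arc_size > arc_c */
}

int
main(void)
{
	model_arc_shrink(256ULL << 20);		/* ask for 256 MiB back */
	printf("arc_c = %llu MiB, arc_p = %llu MiB\n",
	    (unsigned long long)(arc_c >> 20),
	    (unsigned long long)(arc_p >> 20));
	return (0);
}
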
@@ -3035,8 +3014,181 @@ arc_shrink(uint64_t bytes)
(void) arc_adjust();
}
+typedef enum free_memory_reason_t {
+ FMR_UNKNOWN,
+ FMR_NEEDFREE,
+ FMR_LOTSFREE,
+ FMR_SWAPFS_MINFREE,
+ FMR_PAGES_PP_MAXIMUM,
+ FMR_HEAP_ARENA,
+ FMR_ZIO_ARENA,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+#ifdef _KERNEL
+#ifdef __linux__
+/*
+ * expiration time for arc_no_grow set by direct memory reclaim.
+ */
+static clock_t arc_grow_time = 0;
+#else
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+#endif
+#endif /* _KERNEL */
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed. Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+ free_memory_reason_t r = FMR_UNKNOWN;
+
+#ifdef _KERNEL
+#ifdef __linux__
+ /*
+ * Under Linux we are not allowed to directly interrogate the global
+ * memory state. Instead rely on observing that direct reclaim has
+ * recently occurred and therefore the system must be low on memory. The
+ * exact values returned are not critical but should be small.
+ */
+ if (ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
+ lowest = PAGE_SIZE;
+ else
+ lowest = -PAGE_SIZE;
+#else
+ int64_t n;
+
+ /*
+ * Platforms like illumos have greater visibility into the memory
+ * subsystem and can return a more detailed analysis of memory.
+ */
+ if (needfree > 0) {
+ n = PAGESIZE * (-needfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_NEEDFREE;
+ }
+ }
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+ desfree - arc_swapfs_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_SWAPFS_MINFREE;
+ }
+
+
+ /*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ n = PAGESIZE * (availrmem - pages_pp_maximum -
+ arc_pages_pp_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_PAGES_PP_MAXIMUM;
+ }
+
+#if defined(__i386)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ n = vmem_size(heap_arena, VMEM_FREE) -
+ (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_HEAP_ARENA;
+ }
+#endif
+
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this arena remains
+ * above about 1/16th free.
+ *
+ * Note: The 1/16th arena free requirement was put in place
+ * to aggressively evict memory from the arc in order to avoid
+ * memory fragmentation issues.
+ */
+ if (zio_arena != NULL) {
+ n = vmem_size(zio_arena, VMEM_FREE) -
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_ARENA;
+ }
+ }
+#endif /* __linux__ */
+#else
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+#endif
+
+ last_free_memory = lowest;
+ last_free_reason = r;
+
+ return (lowest);
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
+arc_kmem_reap_now(void)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
@@ -3053,13 +3205,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
arc_prune(zfs_arc_meta_prune);
}
- /*
- * An aggressive reclamation will shrink the cache size as well as
- * reap free buffers from the arc kmem caches.
- */
- if (strat == ARC_RECLAIM_AGGR)
- arc_shrink(bytes);
-
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
prev_cache = zio_buf_cache[i];
@@ -3070,11 +3215,18 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
-
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_full_cache);
kmem_cache_reap_now(hdr_l2only_cache);
kmem_cache_reap_now(range_seg_cache);
+
+ if (zio_arena != NULL) {
+ /*
+ * Ask the vmem arena to reclaim unused memory from its
+ * quantum caches.
+ */
+ vmem_qcache_reap(zio_arena);
+ }
}
/*
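
The sign convention is the important part of arc_available_memory(): a positive return is headroom, a negative return is a deficit, and whichever check is most constrained wins (recorded in last_free_reason). A stripped-down sketch of that "lowest value wins" structure, with invented per-check results:

/* Stripped-down sketch of the "lowest value wins" shape of
 * arc_available_memory(); the per-check results are invented. */
#include <stdio.h>
#include <stdint.h>
#include <limits.h>

typedef enum { FMR_UNKNOWN, FMR_LOTSFREE, FMR_ZIO_ARENA } reason_t;

int
main(void)
{
	int64_t lowest = INT64_MAX;
	reason_t r = FMR_UNKNOWN;

	/* each check yields headroom in bytes; negative means a deficit */
	int64_t headroom[] = { 512LL << 20, -(4LL << 20) };
	reason_t why[] = { FMR_LOTSFREE, FMR_ZIO_ARENA };

	for (int i = 0; i < 2; i++) {
		if (headroom[i] < lowest) {
			lowest = headroom[i];
			r = why[i];
		}
	}

	/* lowest < 0 is exactly what arc_reclaim_needed() tests */
	printf("available = %lld bytes (reason %d)\n",
	    (long long)lowest, (int)r);
	return (0);
}
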
@@ -3094,88 +3246,90 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
* using mutex_tryenter() from arc_reclaim_thread().
*/
static void
-arc_adapt_thread(void)
+arc_reclaim_thread(void)
{
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ clock_t growtime = 0;
callb_cpr_t cpr;
- fstrans_cookie_t cookie;
- uint64_t arc_evicted;
CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
- cookie = spl_fstrans_mark();
mutex_enter(&arc_reclaim_lock);
- while (arc_reclaim_thread_exit == 0) {
-#ifndef _KERNEL
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ while (!arc_reclaim_thread_exit) {
+ int64_t to_free;
+ int64_t free_memory = arc_available_memory();
+ uint64_t evicted = 0;
- mutex_exit(&arc_reclaim_lock);
- if (spa_get_random(100) == 0) {
+ arc_tuning_update();
- if (arc_no_grow) {
- if (last_reclaim == ARC_RECLAIM_CONS) {
- last_reclaim = ARC_RECLAIM_AGGR;
- } else {
- last_reclaim = ARC_RECLAIM_CONS;
- }
- } else {
- arc_no_grow = TRUE;
- last_reclaim = ARC_RECLAIM_AGGR;
- membar_producer();
- }
+ mutex_exit(&arc_reclaim_lock);
- /* reset the growth delay for every reclaim */
- arc_grow_time = ddi_get_lbolt() +
- (zfs_arc_grow_retry * hz);
+ if (free_memory < 0) {
- arc_kmem_reap_now(last_reclaim, 0);
+ arc_no_grow = B_TRUE;
arc_warm = B_TRUE;
- }
-#else /* _KERNEL */
- mutex_exit(&arc_reclaim_lock);
-#endif /* !_KERNEL */
-
- /* No recent memory pressure allow the ARC to grow. */
- if (arc_no_grow &&
- ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
- arc_no_grow = FALSE;
- arc_evicted = arc_adjust();
+ /*
+ * Wait at least zfs_grow_retry (default 5) seconds
+ * before considering growing.
+ */
+ growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
- /*
- * We're either no longer overflowing, or we
- * can't evict anything more, so we should wake
- * up any threads before we go to sleep.
- */
- if (arc_size <= arc_c || arc_evicted == 0)
- cv_broadcast(&arc_reclaim_waiters_cv);
+ arc_kmem_reap_now();
- mutex_enter(&arc_reclaim_lock);
+ /*
+ * If we are still low on memory, shrink the ARC
+ * so that we have arc_shrink_min free space.
+ */
+ free_memory = arc_available_memory();
- /* block until needed, or one second, whichever is shorter */
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
- &arc_reclaim_lock, (ddi_get_lbolt() + hz));
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+ to_free = (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+#ifdef _KERNEL
+ to_free = MAX(to_free, ptob(needfree));
+#endif
+ arc_shrink(to_free);
+ }
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (ddi_get_lbolt() >= growtime) {
+ arc_no_grow = B_FALSE;
+ }
+ evicted = arc_adjust();
- /* Allow the module options to be changed */
- if (zfs_arc_max > 64 << 20 &&
- zfs_arc_max < physmem * PAGESIZE &&
- zfs_arc_max != arc_c_max)
- arc_c_max = zfs_arc_max;
+ mutex_enter(&arc_reclaim_lock);
- if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT &&
- zfs_arc_min <= arc_c_max &&
- zfs_arc_min != arc_c_min)
- arc_c_min = zfs_arc_min;
+ /*
+ * If evicted is zero, we couldn't evict anything via
+ * arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop.
+ */
+ if (arc_size <= arc_c || evicted == 0) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * up any threads before we go to sleep.
+ */
+ cv_broadcast(&arc_reclaim_waiters_cv);
- if (zfs_arc_meta_limit > 0 &&
- zfs_arc_meta_limit <= arc_c_max &&
- zfs_arc_meta_limit != arc_meta_limit)
- arc_meta_limit = zfs_arc_meta_limit;
+ /*
+ * Block until signaled, or after one second (we
+ * might need to perform arc_kmem_reap_now()
+ * even if we aren't being signalled)
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
+ &arc_reclaim_lock, ddi_get_lbolt() + hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+ }
}
- arc_reclaim_thread_exit = 0;
+ arc_reclaim_thread_exit = FALSE;
cv_broadcast(&arc_reclaim_thread_cv);
CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
spl_fstrans_unmark(cookie);
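
Each pass of the renamed thread now makes a three-way decision: reap and shrink when there is a deficit, hold the current size when headroom is below arc_c >> arc_no_grow_shift, and allow growth once the grow timer has expired. A compact sketch of just that decision, with the kernel primitives stubbed out and the constants taken from the code above:

/* Sketch of the three-way decision made on each pass of the renamed
 * arc_reclaim_thread(); reaping, eviction and lbolt are stubbed out. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool arc_no_grow;
static int64_t growtime;

static const char *
reclaim_pass(int64_t free_memory, uint64_t arc_c, int64_t now, int hz)
{
	const int arc_grow_retry = 5;		/* seconds */
	const int arc_no_grow_shift = 5;

	if (free_memory < 0) {
		arc_no_grow = true;
		growtime = now + (int64_t)arc_grow_retry * hz;
		/*
		 * The real thread reaps the kmem caches here, re-checks
		 * free memory, and shrinks by
		 * (arc_c >> arc_shrink_shift) - free_memory if still short.
		 */
		return ("reap and shrink");
	} else if (free_memory < (int64_t)(arc_c >> arc_no_grow_shift)) {
		arc_no_grow = true;
		return ("hold size");
	} else if (now >= growtime) {
		arc_no_grow = false;
		return ("allow growth");
	}
	return ("wait");
}

int
main(void)
{
	printf("%s\n", reclaim_pass(-(16LL << 20), 2ULL << 30, 0, 100));
	printf("%s\n", reclaim_pass(1LL << 30, 2ULL << 30, 1000, 100));
	return (0);
}
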
@@ -3185,12 +3339,11 @@ arc_adapt_thread(void)
static void
arc_user_evicts_thread(void)
{
+ fstrans_cookie_t cookie = spl_fstrans_mark();
callb_cpr_t cpr;
- fstrans_cookie_t cookie;
CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
- cookie = spl_fstrans_mark();
mutex_enter(&arc_user_evicts_lock);
while (!arc_user_evicts_thread_exit) {
mutex_exit(&arc_user_evicts_lock);
@@ -3338,15 +3491,15 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
* reap whatever we can from the various arc slabs.
*/
if (pages > 0) {
- arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
-
+ arc_shrink(ptob(sc->nr_to_scan));
+ arc_kmem_reap_now();
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
pages = MAX(pages - btop(arc_evictable_memory()), 0);
#else
pages = btop(arc_evictable_memory());
#endif
} else {
- arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+ arc_kmem_reap_now();
pages = SHRINK_STOP;
}
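
In the shrinker path the cache-size reduction and the slab reaping are now separate calls, and sc->nr_to_scan arrives in pages, so ptob() scales it to bytes before arc_shrink(). A tiny sketch of that conversion, assuming a 4 KiB page size and a hypothetical model_ptob() helper:

/* Pages-to-bytes conversion applied to sc->nr_to_scan before calling
 * arc_shrink(); the 4 KiB page size is an assumption for the example. */
#include <stdio.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT	12
#define model_ptob(pages)	((uint64_t)(pages) << MODEL_PAGE_SHIFT)

int
main(void)
{
	unsigned long nr_to_scan = 128;		/* pages requested by the VM */
	printf("arc_shrink(%llu bytes)\n",
	    (unsigned long long)model_ptob(nr_to_scan));
	return (0);
}
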
@@ -3421,6 +3574,11 @@ arc_adapt(int bytes, arc_state_t *state)
}
ASSERT((int64_t)arc_p >= 0);
+ if (arc_reclaim_needed()) {
+ cv_signal(&arc_reclaim_thread_cv);
+ return;
+ }
+
if (arc_no_grow)
return;
@@ -4721,7 +4879,11 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
if (zfs_arc_memory_throttle_disable)
return (0);
- if (freemem <= physmem * arc_lotsfree_percent / 100) {
+ if (freemem > physmem * arc_lotsfree_percent / 100)
+ return (0);
+
+ if (arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
return (SET_ERROR(EAGAIN));
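
The reordered check returns early while at least arc_lotsfree_percent of physical memory is free, and only consults arc_reclaim_needed() below that line. A small sketch of the percentage test, with made-up page counts:

/* Sketch of the reordered arc_memory_throttle() fast path: no throttling
 * while at least arc_lotsfree_percent of memory is free (counts invented). */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem = 4ULL * 1024 * 1024;	/* pages, assumed */
	uint64_t freemem = 500ULL * 1024;	/* pages, assumed */
	int arc_lotsfree_percent = 10;

	if (freemem > physmem * arc_lotsfree_percent / 100)
		printf("plenty free: return 0 immediately\n");
	else
		printf("low memory: consult arc_reclaim_needed(), maybe EAGAIN\n");
	return (0);
}
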
@@ -4871,9 +5033,73 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj)
multilist_get_num_sublists(ml));
}
+/*
+ * Called during module initialization and periodically thereafter to
+ * apply reasonable changes to the exposed performance tunings. Non-zero
+ * zfs_* values which differ from the currently set values will be applied.
+ */
+static void
+arc_tuning_update(void)
+{
+ /* Valid range: 64M - <all physical memory> */
+ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
+ (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
+ (zfs_arc_max > arc_c_min)) {
+ arc_c_max = zfs_arc_max;
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+ arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
+ }
+
+ /* Valid range: 32M - <arc_c_max> */
+ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
+ (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_min <= arc_c_max)) {
+ arc_c_min = zfs_arc_min;
+ arc_c = MAX(arc_c, arc_c_min);
+ }
+
+ /* Valid range: 16M - <arc_c_max> */
+ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
+ (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_meta_min <= arc_c_max)) {
+ arc_meta_min = zfs_arc_meta_min;
+ arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
+ }
+
+ /* Valid range: <arc_meta_min> - <arc_c_max> */
+ if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
+ (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
+ (zfs_arc_meta_limit <= arc_c_max))
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_grow_retry)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_shrink_shift) {
+ arc_shrink_shift = zfs_arc_shrink_shift;
+ arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
+ }
+
+ /* Valid range: 1 - N ticks */
+ if (zfs_arc_min_prefetch_lifespan)
+ arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+}
+
void
arc_init(void)
{
+ /*
+ * allmem is "all memory that we could possibly use".
+ */
+#ifdef _KERNEL
+ uint64_t allmem = ptob(physmem);
+#else
+ uint64_t allmem = (physmem * PAGESIZE) / 2;
+#endif
+
mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
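
arc_tuning_update() applies a zfs_* module parameter only when it is non-zero, differs from the current value, and lies in its documented range. A standalone sketch of just the zfs_arc_max rule (above 64 MiB, below all physical memory, and above arc_c_min), with illustrative sizes:

/* Sketch of the zfs_arc_max acceptance rule from arc_tuning_update();
 * the memory sizes used below are illustrative. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool
zfs_arc_max_valid(uint64_t zfs_arc_max, uint64_t arc_c_max,
    uint64_t arc_c_min, uint64_t allmem)
{
	return (zfs_arc_max != 0 && zfs_arc_max != arc_c_max &&
	    zfs_arc_max > (64ULL << 20) && zfs_arc_max < allmem &&
	    zfs_arc_max > arc_c_min);
}

int
main(void)
{
	uint64_t allmem = 16ULL << 30;		/* assumed 16 GiB */
	uint64_t arc_c_min = 2ULL << 24;	/* 32 MiB */
	uint64_t arc_c_max = allmem / 2;

	printf("zfs_arc_max=12G accepted: %d\n",
	    zfs_arc_max_valid(12ULL << 30, arc_c_max, arc_c_min, allmem));
	printf("zfs_arc_max=16M accepted: %d\n",
	    zfs_arc_max_valid(16ULL << 20, arc_c_max, arc_c_min, allmem));
	return (0);
}
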
@@ -4882,10 +5108,10 @@ arc_init(void)
cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
- zfs_arc_min_prefetch_lifespan = 1 * hz;
+ arc_min_prefetch_lifespan = 1 * hz;
/* Start out with 1/8 of all memory */
- arc_c = physmem * PAGESIZE / 8;
+ arc_c = allmem / 8;
#ifdef _KERNEL
/*
@@ -4894,6 +5120,7 @@ arc_init(void)
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+
/*
* Register a shrinker to support synchronous (direct) memory
* reclaim from the arc. This is done to prevent kswapd from
@@ -4902,40 +5129,26 @@ arc_init(void)
spl_register_shrinker(&arc_shrinker);
#endif
- /* set min cache to allow safe operation of arc_adapt() */
+ /* Set min cache to allow safe operation of arc_adapt() */
arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
- /* set max to 1/2 of all memory */
- arc_c_max = arc_c * 4;
-
- /*
- * Allow the tunables to override our calculations if they are
- * reasonable (ie. over 64MB)
- */
- if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
- arc_c_max = zfs_arc_max;
- if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT &&
- zfs_arc_min <= arc_c_max)
- arc_c_min = zfs_arc_min;
+ /* Set max to 1/2 of all memory */
+ arc_c_max = allmem / 2;
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
- /* limit meta-data to 3/4 of the arc capacity */
- arc_meta_limit = (3 * arc_c_max) / 4;
+ /* Set min to 1/2 of arc_c_min */
+ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
+ /* Initialize maximum observed usage to zero */
arc_meta_max = 0;
+ /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
+ arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
- /* Allow the tunable to override if it is reasonable */
- if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
- arc_meta_limit = zfs_arc_meta_limit;
-
- if (zfs_arc_meta_min > 0) {
- arc_meta_min = zfs_arc_meta_min;
- } else {
- arc_meta_min = arc_c_min / 2;
- }
+ /* Apply user specified tunings */
+ arc_tuning_update();
if (zfs_arc_num_sublists_per_state < 1)
- zfs_arc_num_sublists_per_state = num_online_cpus();
+ zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
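
Putting the arc_init() defaults together: arc_c starts at allmem/8 before being raised to arc_c_max, arc_c_max becomes allmem/2, and arc_meta_limit is three quarters of arc_c_max with arc_meta_min as a floor. A small arithmetic sketch for an assumed 8 GiB allmem, taking SPA_MAXBLOCKSHIFT as 24 to match the 32M/16M ranges noted above:

/* Default ARC sizing from arc_init(), computed for an assumed 8 GiB of
 * usable memory; SPA_MAXBLOCKSHIFT is taken as 24 (16 MiB blocks). */
#include <stdio.h>
#include <stdint.h>

#define SPA_MAXBLOCKSHIFT	24

int
main(void)
{
	uint64_t allmem = 8ULL << 30;
	uint64_t arc_c = allmem / 8;			/* initial target */
	uint64_t arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;	/* 32 MiB */
	uint64_t arc_c_max = allmem / 2;		/* 4 GiB */
	uint64_t arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* 16 MiB */
	uint64_t arc_meta_limit = (3 * arc_c_max) / 4;

	if (arc_meta_limit < arc_meta_min)		/* floor at arc_meta_min */
		arc_meta_limit = arc_meta_min;

	printf("arc_c=%llu MiB arc_c_max=%llu MiB arc_meta_limit=%llu MiB "
	    "arc_c_min=%llu MiB\n",
	    (unsigned long long)(arc_c >> 20),
	    (unsigned long long)(arc_c_max >> 20),
	    (unsigned long long)(arc_meta_limit >> 20),
	    (unsigned long long)(arc_c_min >> 20));
	return (0);
}
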
@@ -5021,7 +5234,7 @@ arc_init(void)
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
(void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
@@ -6345,7 +6558,7 @@ l2arc_feed_thread(void)
/*
* Avoid contributing to memory pressure.
*/
- if (arc_no_grow) {
+ if (arc_reclaim_needed()) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
@@ -6568,7 +6781,7 @@ MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
-module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
+module_param(zfs_arc_meta_adjust_restarts, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
"Limit number of restarts in arc_adjust_meta");