Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--   module/zfs/arc.c   227
1 file changed, 149 insertions, 78 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cbe0a6028..ccc9510fd 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -172,8 +172,11 @@ int arc_evict_iterations = 100;
 /* number of seconds before growing cache again */
 int zfs_arc_grow_retry = 5;
 
-/* shift of arc_c for calculating both min and max arc_p */
-int zfs_arc_p_min_shift = 4;
+/* disable anon data aggressively growing arc_p */
+int zfs_arc_p_aggressive_disable = 1;
+
+/* disable arc_p adapt dampener in arc_adapt */
+int zfs_arc_p_dampener_disable = 1;
 
 /* log2(fraction of arc to reclaim) */
 int zfs_arc_shrink_shift = 5;
@@ -305,6 +308,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_size;
 	kstat_named_t arcstat_hdr_size;
 	kstat_named_t arcstat_data_size;
+	kstat_named_t arcstat_meta_size;
 	kstat_named_t arcstat_other_size;
 	kstat_named_t arcstat_anon_size;
 	kstat_named_t arcstat_anon_evict_data;
@@ -392,6 +396,7 @@ static arc_stats_t arc_stats = {
 	{ "size", KSTAT_DATA_UINT64 },
 	{ "hdr_size", KSTAT_DATA_UINT64 },
 	{ "data_size", KSTAT_DATA_UINT64 },
+	{ "meta_size", KSTAT_DATA_UINT64 },
 	{ "other_size", KSTAT_DATA_UINT64 },
 	{ "anon_size", KSTAT_DATA_UINT64 },
 	{ "anon_evict_data", KSTAT_DATA_UINT64 },
@@ -1364,6 +1369,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_meta_size, space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, space);
 		break;
@@ -1375,7 +1383,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 		break;
 	}
 
-	ARCSTAT_INCR(arcstat_meta_used, space);
+	if (type != ARC_SPACE_DATA)
+		ARCSTAT_INCR(arcstat_meta_used, space);
+
 	atomic_add_64(&arc_size, space);
 }
@@ -1390,6 +1400,9 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_meta_size, -space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, -space);
 		break;
@@ -1401,10 +1414,13 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 		break;
 	}
 
-	ASSERT(arc_meta_used >= space);
-	if (arc_meta_max < arc_meta_used)
-		arc_meta_max = arc_meta_used;
-	ARCSTAT_INCR(arcstat_meta_used, -space);
+	if (type != ARC_SPACE_DATA) {
+		ASSERT(arc_meta_used >= space);
+		if (arc_meta_max < arc_meta_used)
+			arc_meta_max = arc_meta_used;
+		ARCSTAT_INCR(arcstat_meta_used, -space);
+	}
+
 	ASSERT(arc_size >= space);
 	atomic_add_64(&arc_size, -space);
 }
@@ -1601,12 +1617,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 	if (!recycle) {
 		if (type == ARC_BUFC_METADATA) {
 			arc_buf_data_free(buf, zio_buf_free);
-			arc_space_return(size, ARC_SPACE_DATA);
+			arc_space_return(size, ARC_SPACE_META);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			arc_buf_data_free(buf, zio_data_buf_free);
-			ARCSTAT_INCR(arcstat_data_size, -size);
-			atomic_add_64(&arc_size, -size);
+			arc_space_return(size, ARC_SPACE_DATA);
 		}
 	}
 	if (list_link_active(&buf->b_hdr->b_arc_node)) {
@@ -1887,6 +1902,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
+top:
 	mutex_enter(&state->arcs_mtx);
 	mutex_enter(&evicted_state->arcs_mtx);
@@ -2002,6 +2018,15 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 	mutex_exit(&evicted_state->arcs_mtx);
 	mutex_exit(&state->arcs_mtx);
 
+	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+	    (bytes < 0 || bytes_evicted < bytes)) {
+		/* Prevent second pass from recycling metadata into data */
+		recycle = FALSE;
+		type = ARC_BUFC_METADATA;
+		list = &state->arcs_list[type];
+		goto top;
+	}
+
 	if (bytes_evicted < bytes)
 		dprintf("only evicted %lld bytes from %x\n",
 		    (longlong_t)bytes_evicted, state);
@@ -2141,19 +2166,11 @@ arc_adjust(void)
 	 */
 
 	adjustment = MIN((int64_t)(arc_size - arc_c),
-	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
-	    arc_p));
+	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
+	if (adjustment > 0 && arc_mru->arcs_size > 0) {
+		delta = MIN(arc_mru->arcs_size, adjustment);
 		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
-
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
 	}
 
 	/*
@@ -2162,17 +2179,9 @@ arc_adjust(void)
 	adjustment = arc_size - arc_c;
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
+	if (adjustment > 0 && arc_mfu->arcs_size > 0) {
+		delta = MIN(arc_mfu->arcs_size, adjustment);
 		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
-
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		int64_t delta = MIN(adjustment,
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
 	}
 
 	/*
@@ -2265,24 +2274,61 @@ arc_do_user_evicts(void)
  * This is only used to enforce the tunable arc_meta_limit, if we are
  * unable to evict enough buffers notify the user via the prune callback.
  */
-void
-arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+static void
+arc_adjust_meta(void)
 {
-	int64_t delta;
+	int64_t adjustmnt, delta;
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+	/*
+	 * This slightly differs than the way we evict from the mru in
+	 * arc_adjust because we don't have a "target" value (i.e. no
+	 * "meta" arc_p). As a result, I think we can completely
+	 * cannibalize the metadata in the MRU before we evict the
+	 * metadata from the MFU. I think we probably need to implement a
+	 * "metadata arc_p" value to do this properly.
+	 */
+	adjustmnt = arc_meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
 		arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
-		adjustment -= delta;
+		adjustmnt -= delta;
 	}
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+	/*
+	 * We can't afford to recalculate adjustmnt here. If we do,
+	 * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalize the MFU metadata. Although the fudge factor is
+	 * small, it has been empirically shown to be significant for
+	 * certain workloads (e.g. creating many empty directories). As
+	 * such, we use the original calculation for adjustmnt, and
+	 * simply decrement the amount of data evicted from the MRU.
+	 */
+
+	if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
 		arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
-		adjustment -= delta;
 	}
 
-	if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+	adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+	    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+
+	if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(adjustmnt,
+		    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+		arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+	}
+
+	adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+
+	if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(adjustmnt,
+		    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+		arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
+	}
+
+	if (arc_meta_used > arc_meta_limit)
 		arc_do_user_prune(zfs_arc_meta_prune);
 }
@@ -2341,7 +2387,13 @@ arc_shrink(uint64_t bytes)
 	else
 		arc_c = arc_c_min;
 
-	atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
+	to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
+
+	if (arc_p > to_free)
+		atomic_add_64(&arc_p, -to_free);
+	else
+		arc_p = 0;
+
 	if (arc_c > arc_size)
 		arc_c = MAX(arc_size, arc_c_min);
 	if (arc_p > arc_c)
@@ -2396,7 +2448,6 @@ static void
 arc_adapt_thread(void)
 {
 	callb_cpr_t cpr;
-	int64_t prune;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
@@ -2432,14 +2483,7 @@ arc_adapt_thread(void)
 		if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
 			arc_no_grow = FALSE;
 
-		/*
-		 * Keep meta data usage within limits, arc_shrink() is not
-		 * used to avoid collapsing the arc_c value when only the
-		 * arc_meta_limit is being exceeded.
-		 */
-		prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
-		if (prune > 0)
-			arc_adjust_meta(prune, B_TRUE);
+		arc_adjust_meta();
 
 		arc_adjust();
@@ -2574,8 +2618,10 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 	 */
 	if (pages > 0) {
 		arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+		pages = btop(arc_evictable_memory());
 	} else {
 		arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+		pages = -1;
 	}
 
 	/*
@@ -2595,7 +2641,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 
 	mutex_exit(&arc_reclaim_thr_lock);
 
-	return (-1);
+	return (pages);
 }
 
 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
@@ -2611,7 +2657,6 @@ static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
-	uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
 
 	if (state == arc_l2c_only)
 		return;
@@ -2628,18 +2673,22 @@ arc_adapt(int bytes, arc_state_t *state)
 	if (state == arc_mru_ghost) {
 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
-		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
-		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+		if (!zfs_arc_p_dampener_disable)
+			mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+		arc_p = MIN(arc_c, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
-		mult = MIN(mult, 10);
+
+		if (!zfs_arc_p_dampener_disable)
+			mult = MIN(mult, 10);
 
 		delta = MIN(bytes * mult, arc_p);
-		arc_p = MAX(arc_p_min, arc_p - delta);
+		arc_p = MAX(0, arc_p - delta);
 	}
 
 	ASSERT((int64_t)arc_p >= 0);
@@ -2710,6 +2759,8 @@ arc_get_data_buf(arc_buf_t *buf)
 	arc_state_t		*state = buf->b_hdr->b_state;
 	uint64_t		size = buf->b_hdr->b_size;
 	arc_buf_contents_t	type = buf->b_hdr->b_type;
+	arc_buf_contents_t	evict = ARC_BUFC_DATA;
+	boolean_t		recycle = TRUE;
 
 	arc_adapt(size, state);
@@ -2720,12 +2771,11 @@ arc_get_data_buf(arc_buf_t *buf)
 	if (!arc_evict_needed(type)) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
+			arc_space_consume(size, ARC_SPACE_META);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
+			arc_space_consume(size, ARC_SPACE_DATA);
 		}
 		goto out;
 	}
@@ -2750,10 +2800,27 @@ arc_get_data_buf(arc_buf_t *buf)
 	    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
 
-	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
+	/*
+	 * Evict data buffers prior to metadata buffers, unless we're
+	 * over the metadata limit and adding a metadata buffer.
+	 */
+	if (type == ARC_BUFC_METADATA) {
+		if (arc_meta_used >= arc_meta_limit)
+			evict = ARC_BUFC_METADATA;
+		else
+			/*
+			 * In this case, we're evicting data while
			 * adding metadata. Thus, to prevent recycling a
+			 * data buffer into a metadata buffer, recycling
+			 * is disabled in the following arc_evict call.
+			 */
+			recycle = FALSE;
+	}
+
+	if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
+			arc_space_consume(size, ARC_SPACE_META);
 
 			/*
 			 * If we are unable to recycle an existing meta buffer
@@ -2761,16 +2828,19 @@ arc_get_data_buf(arc_buf_t *buf)
 			 * via the prune callback to drop references. The
 			 * prune callback in run in the context of the reclaim
 			 * thread to avoid deadlocking on the hash_lock.
+			 * Of course, only do this when recycle is true.
 			 */
-			cv_signal(&arc_reclaim_thr_cv);
+			if (recycle)
+				cv_signal(&arc_reclaim_thr_cv);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
+			arc_space_consume(size, ARC_SPACE_DATA);
 		}
 
-		ARCSTAT_BUMP(arcstat_recycle_miss);
+		/* Only bump this if we tried to recycle and failed */
+		if (recycle)
+			ARCSTAT_BUMP(arcstat_recycle_miss);
 	}
 	ASSERT(buf->b_data != NULL);
 out:
@@ -2790,7 +2860,8 @@ out:
 	 * If we are growing the cache, and we are adding anonymous
 	 * data, and we have outgrown arc_p, update arc_p
 	 */
-	if (arc_size < arc_c && hdr->b_state == arc_anon &&
+	if (!zfs_arc_p_aggressive_disable &&
+	    arc_size < arc_c && hdr->b_state == arc_anon &&
 	    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
 		arc_p = MIN(arc_c, arc_p + size);
 }
@@ -4025,8 +4096,8 @@ arc_init(void)
 	spl_register_shrinker(&arc_shrinker);
 #endif
 
-	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
-	arc_c_min = MAX(arc_c / 4, 64<<20);
+	/* set min cache to zero */
+	arc_c_min = 4<<20;
 
 	/* set max to 1/2 of all memory */
 	arc_c_max = arc_c * 4;
@@ -4036,23 +4107,20 @@ arc_init(void)
 	 */
 	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
 		arc_c_max = zfs_arc_max;
-	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+	if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
 
 	arc_c = arc_c_max;
 	arc_p = (arc_c >> 1);
 
-	/* limit meta-data to 1/4 of the arc capacity */
-	arc_meta_limit = arc_c_max / 4;
+	/* limit meta-data to 3/4 of the arc capacity */
+	arc_meta_limit = (3 * arc_c_max) / 4;
 	arc_meta_max = 0;
 
 	/* Allow the tunable to override if it is reasonable */
 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
 		arc_meta_limit = zfs_arc_meta_limit;
 
-	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
-		arc_c_min = arc_meta_limit / 2;
-
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
@@ -5548,12 +5616,15 @@ MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
+module_param(zfs_arc_p_aggressive_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
+
+module_param(zfs_arc_p_dampener_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
+
 module_param(zfs_arc_shrink_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 
-module_param(zfs_arc_p_min_shift, int, 0644);
-MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
-
 module_param(zfs_disable_dup_eviction, int, 0644);
 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
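
The rewritten arc_adjust_meta() above walks metadata in a fixed order: MRU, then MFU, then the MRU and MFU ghost lists, and finally falls back to the registered prune callbacks if arc_meta_used is still above arc_meta_limit. Below is a minimal user-space model of that cascade, not the kernel code: the evict() helper and the starting numbers are invented for illustration, and only the ordering and the way adjustmnt is decremented (rather than recalculated) between the MRU and MFU passes mirror the patch.

/*
 * Minimal user-space model of the metadata eviction cascade added by this
 * patch. The arc_* variables and evict() below are simplified stand-ins;
 * ghost-list eviction is modeled as freeing no accounted metadata, since
 * ghost lists hold only headers.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static uint64_t arc_meta_used = 900;	/* hypothetical values, in "units" */
static uint64_t arc_meta_limit = 600;
static uint64_t mru_meta = 200, mfu_meta = 300;
static uint64_t mru_ghost_meta = 250, mfu_ghost_meta = 150;

/* stand-in for arc_evict()/arc_evict_ghost(): shrink a list, track usage */
static uint64_t
evict(uint64_t *list, uint64_t wanted, int counts_toward_used)
{
	uint64_t evicted = MIN(*list, wanted);

	*list -= evicted;
	if (counts_toward_used)
		arc_meta_used -= evicted;
	return (evicted);
}

int
main(void)
{
	int64_t adjustmnt, delta;

	/* 1 and 2: MRU then MFU metadata, against one fixed target */
	adjustmnt = (int64_t)(arc_meta_used - arc_meta_limit);
	if (adjustmnt > 0 && mru_meta > 0) {
		delta = evict(&mru_meta, adjustmnt, 1);
		adjustmnt -= delta;	/* decremented, not recalculated */
	}
	if (adjustmnt > 0 && mfu_meta > 0)
		(void) evict(&mfu_meta, adjustmnt, 1);

	/* 3: MRU ghost metadata, sized against MRU + MRU-ghost */
	adjustmnt = (int64_t)(mru_meta + mru_ghost_meta - arc_meta_limit);
	if (adjustmnt > 0 && mru_ghost_meta > 0)
		(void) evict(&mru_ghost_meta, adjustmnt, 0);

	/* 4: MFU ghost metadata, sized against MRU-ghost + MFU-ghost */
	adjustmnt = (int64_t)(mru_ghost_meta + mfu_ghost_meta - arc_meta_limit);
	if (adjustmnt > 0 && mfu_ghost_meta > 0)
		(void) evict(&mfu_ghost_meta, adjustmnt, 0);

	/* 5: still over the limit? the kernel would call the prune callbacks */
	if (arc_meta_used > arc_meta_limit)
		printf("would invoke arc_do_user_prune()\n");

	printf("meta used %" PRIu64 " / limit %" PRIu64 "\n",
	    arc_meta_used, arc_meta_limit);
	return (0);
}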
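
arc_get_data_buf() now also decides, before calling arc_evict(), which buffer type to evict and whether the evicted buffer may be recycled into the new allocation. The sketch below isolates just that decision with simplified stand-in types; in the patch the same logic operates on arc_buf_contents_t and the global arc_meta_used/arc_meta_limit.

/*
 * Sketch of the allocation-time eviction decision added to
 * arc_get_data_buf(). Types and names are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum { BUFC_DATA, BUFC_METADATA } bufc_t;

struct evict_policy {
	bufc_t	evict;		/* which list arc_evict() should target */
	bool	recycle;	/* may the evicted buffer be reused in place? */
};

static struct evict_policy
choose_eviction(bufc_t alloc_type, unsigned long meta_used,
    unsigned long meta_limit)
{
	/* default: evict data and allow same-type recycling */
	struct evict_policy p = { BUFC_DATA, true };

	if (alloc_type == BUFC_METADATA) {
		if (meta_used >= meta_limit) {
			/* over the meta limit: make room in metadata itself */
			p.evict = BUFC_METADATA;
		} else {
			/*
			 * Evicting data to satisfy a metadata allocation:
			 * disable recycling so a data buffer is never handed
			 * back as a metadata buffer.
			 */
			p.recycle = false;
		}
	}
	return (p);
}

int
main(void)
{
	struct evict_policy p = choose_eviction(BUFC_METADATA, 100, 400);

	printf("evict %s, recycle=%d\n",
	    p.evict == BUFC_METADATA ? "metadata" : "data", p.recycle);
	return (0);
}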