Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/arc.c         | 188
-rw-r--r--  module/zfs/zfs_vfsops.c  |  21
-rw-r--r--  module/zfs/zfs_znode.c   |   2
-rw-r--r--  module/zfs/zpl_super.c   | 126

4 files changed, 287 insertions(+), 50 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f71c24844..95d14a9e7 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
  * protected from simultaneous callbacks from arc_buf_evict()
  * and arc_do_user_evicts().
  *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a reference on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
@@ -120,14 +128,13 @@
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
-#include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <vm/anon.h>
 #include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
@@ -141,8 +148,8 @@ extern int zfs_write_limit_shift;
 extern uint64_t zfs_write_limit_max;
 extern kmutex_t zfs_write_limit_lock;
 
-#define	ARC_REDUCE_DNLC_PERCENT	3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
 
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
@@ -180,7 +187,7 @@ unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -288,6 +295,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_no_grow;
 	kstat_named_t arcstat_tempreserve;
 	kstat_named_t arcstat_loaned_bytes;
+	kstat_named_t arcstat_prune;
 	kstat_named_t arcstat_meta_used;
 	kstat_named_t arcstat_meta_limit;
 	kstat_named_t arcstat_meta_max;
@@ -352,6 +360,7 @@ static arc_stats_t arc_stats = {
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
+	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
@@ -481,6 +490,8 @@ struct arc_buf_hdr {
 	list_node_t		b_l2node;
};
 
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
 static arc_buf_t *arc_eviction_list;
 static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
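The registration machinery declared above (arc_prune_list, arc_prune_mtx) is driven through types and prototypes that live in sys/arc.h, which falls outside this diffstat (limited to module/zfs). The sketch below reconstructs what those declarations presumably look like from the fields and calls used in arc.c; treat the exact header layout as an assumption rather than part of this commit.

/* Sketch of the expected sys/arc.h additions -- not shown in this diff. */
typedef void arc_prune_func_t(int64_t bytes, void *private);

typedef struct arc_prune {
	arc_prune_func_t	*p_pfunc;	/* callback run by arc_do_user_prune() */
	void			*p_private;	/* opaque argument passed back to p_pfunc */
	list_node_t		p_node;		/* linkage on arc_prune_list */
	refcount_t		p_refcnt;	/* pins the entry while its callback runs */
} arc_prune_t;

arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
void arc_remove_prune_callback(arc_prune_t *p);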
@@ -1925,6 +1936,48 @@ arc_adjust(void)
 	}
 }
 
+/*
+ * Request that the arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers (i.e. the zpl).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+	arc_prune_func_t *func;
+	void *private;
+	arc_prune_t *cp, *np;
+
+	mutex_enter(&arc_prune_mtx);
+
+	cp = list_head(&arc_prune_list);
+	while (cp != NULL) {
+		func = cp->p_pfunc;
+		private = cp->p_private;
+		np = list_next(&arc_prune_list, cp);
+		refcount_add(&cp->p_refcnt, func);
+		mutex_exit(&arc_prune_mtx);
+
+		if (func != NULL)
+			func(adjustment, private);
+
+		mutex_enter(&arc_prune_mtx);
+
+		/* User removed prune callback concurrently with execution */
+		if (refcount_remove(&cp->p_refcnt, func) == 0) {
+			ASSERT(!list_link_active(&cp->p_node));
+			refcount_destroy(&cp->p_refcnt);
+			kmem_free(cp, sizeof (*cp));
+		}
+
+		cp = np;
+	}
+
+	ARCSTAT_BUMP(arcstat_prune);
+	mutex_exit(&arc_prune_mtx);
+}
+
 static void
 arc_do_user_evicts(void)
 {
@@ -1949,6 +2002,32 @@ arc_do_user_evicts(void)
 }
 
 /*
+ * Evict only meta data objects from the cache, leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+	int64_t delta;
+
+	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+		arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+		adjustment -= delta;
+	}
+
+	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+		arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+		adjustment -= delta;
+	}
+
+	if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+		arc_do_user_prune(arc_meta_prune);
+}
+
+/*
  * Flush all *evictable* data from the cache for the given spa.
  * NOTE: this will not touch "active" (i.e. referenced) data.
 */
@@ -2085,24 +2164,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	kmem_cache_t		*prev_data_cache = NULL;
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
-#ifdef _KERNEL
-	int			retry = 0;
-
-	while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
-		/*
-		 * We are exceeding our meta-data cache limit.
-		 * Purge some DNLC entries to release holds on meta-data.
-		 */
-		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-		retry++;
-	}
-#if defined(__i386)
-	/*
-	 * Reclaim unused memory from all kmem caches.
-	 */
-	kmem_reap();
-#endif
-#endif
 
 	/*
 	 * An aggressive reclamation will shrink the cache size as well as
@@ -2121,6 +2182,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 			kmem_cache_reap_now(zio_data_buf_cache[i]);
 		}
 	}
+
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
 }
@@ -2131,6 +2193,7 @@ arc_reclaim_thread(void)
 	clock_t			growtime = 0;
 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 	callb_cpr_t		cpr;
+	int64_t			prune;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
@@ -2160,9 +2223,14 @@ arc_reclaim_thread(void)
 			arc_no_grow = FALSE;
 		}
 
-		/* Keep meta data usage within limits */
-		if (arc_meta_used >= arc_meta_limit)
-			arc_kmem_reap_now(ARC_RECLAIM_CONS);
+		/*
+		 * Keep meta data usage within limits; arc_shrink() is not
+		 * used to avoid collapsing the arc_c value when only the
+		 * arc_meta_limit is being exceeded.
+		 */
+		prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+		if (prune > 0)
+			arc_adjust_meta(prune, B_TRUE);
 
 		arc_adjust();
 
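To make the two-stage behavior of arc_adjust_meta() concrete, the standalone program below (not ZFS code, with made-up example sizes) walks the same arithmetic: evictable MRU metadata is reclaimed first, then MFU, and only a remaining deficit triggers a prune request to the registered callbacks.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	int64_t adjustment = 10 << 20;	/* example: 10 MiB over arc_meta_limit */
	int64_t mru_meta = 4 << 20;	/* example: evictable MRU metadata */
	int64_t mfu_meta = 2 << 20;	/* example: evictable MFU metadata */
	int64_t delta;

	delta = MIN(mru_meta, adjustment);	/* stage 1: evict from MRU */
	adjustment -= delta;

	delta = MIN(mfu_meta, adjustment);	/* stage 2: evict from MFU */
	adjustment -= delta;

	/* 4 MiB is still pinned by references held above the ARC */
	if (adjustment > 0)
		printf("%" PRId64 " bytes still pinned; prune callbacks are "
		    "asked to drop arc_meta_prune (1048576) bytes per pass\n",
		    adjustment);

	return (0);
}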
@@ -2399,16 +2467,27 @@ arc_get_data_buf(arc_buf_t *buf)
 		state = (arc_mru->arcs_lsize[type] >= size &&
 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
+
 	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
 			arc_space_consume(size, ARC_SPACE_DATA);
+
+			/*
+			 * If we are unable to recycle an existing meta buffer,
+			 * signal the reclaim thread.  It will notify users
+			 * via the prune callback to drop references.  The
+			 * prune callback is run in the context of the reclaim
+			 * thread to avoid deadlocking on the hash_lock.
+			 */
+			cv_signal(&arc_reclaim_thr_cv);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
 			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
 		}
+
 		ARCSTAT_BUMP(arcstat_recycle_miss);
 	}
 	ASSERT(buf->b_data != NULL);
@@ -3021,6 +3100,37 @@ top:
 	return (0);
 }
 
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+	arc_prune_t *p;
+
+	p = kmem_alloc(sizeof (*p), KM_SLEEP);
+	p->p_pfunc = func;
+	p->p_private = private;
+	list_link_init(&p->p_node);
+	refcount_create(&p->p_refcnt);
+
+	mutex_enter(&arc_prune_mtx);
+	refcount_add(&p->p_refcnt, &arc_prune_list);
+	list_insert_head(&arc_prune_list, p);
+	mutex_exit(&arc_prune_mtx);
+
+	return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+	mutex_enter(&arc_prune_mtx);
+	list_remove(&arc_prune_list, p);
+	if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+		refcount_destroy(&p->p_refcnt);
+		kmem_free(p, sizeof (*p));
+	}
+	mutex_exit(&arc_prune_mtx);
+}
+
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
@@ -3598,8 +3708,8 @@ arc_init(void)
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
-	if (zfs_arc_reduce_dnlc_percent > 0)
-		arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+	if (zfs_arc_meta_prune > 0)
+		arc_meta_prune = zfs_arc_meta_prune;
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
@@ -3646,7 +3756,10 @@ arc_init(void)
 	buf_init();
 
 	arc_thread_exit = 0;
+	list_create(&arc_prune_list, sizeof (arc_prune_t),
+	    offsetof(arc_prune_t, p_node));
 	arc_eviction_list = NULL;
+	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
@@ -3674,6 +3787,8 @@ arc_init(void)
 void
 arc_fini(void)
 {
+	arc_prune_t *p;
+
 	mutex_enter(&arc_reclaim_thr_lock);
 #ifdef _KERNEL
 	spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3808,17 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	mutex_enter(&arc_prune_mtx);
+	while ((p = list_head(&arc_prune_list)) != NULL) {
+		list_remove(&arc_prune_list, p);
+		refcount_remove(&p->p_refcnt, &arc_prune_list);
+		refcount_destroy(&p->p_refcnt);
+		kmem_free(p, sizeof (*p));
+	}
+	mutex_exit(&arc_prune_mtx);
+
+	list_destroy(&arc_prune_list);
+	mutex_destroy(&arc_prune_mtx);
 	mutex_destroy(&arc_eviction_mtx);
 	mutex_destroy(&arc_reclaim_thr_lock);
 	cv_destroy(&arc_reclaim_thr_cv);
@@ -4774,6 +4900,8 @@ l2arc_stop(void)
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_remove_ref);
 EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
 
 module_param(zfs_arc_min, ulong, 0444);
 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4912,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
 module_param(zfs_arc_meta_limit, ulong, 0444);
 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
 
 module_param(zfs_arc_grow_retry, int, 0444);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
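With the registration functions and tunables above, an ARC consumer uses the interface in three steps: register once, drop references when called back on the reclaim thread, and unregister at teardown. The sketch below uses hypothetical my_* names purely for illustration; the real registration for the ZPL is the zpl_prune_sbs callback added to zfs_init() in the zfs_vfsops.c hunk that follows.

/*
 * Hypothetical ARC consumer showing the prune callback lifecycle.
 * my_cache, my_cache_shrink(), my_cache_setup() and my_cache_teardown()
 * are illustrative names, not part of this commit.
 */
static arc_prune_t *my_prune_cb;

static void
my_cache_prune(int64_t bytes_to_free, void *private)
{
	struct my_cache *mc = private;

	/*
	 * Runs on the ARC reclaim thread.  Drop roughly bytes_to_free
	 * worth of references so the metadata buffers they pin become
	 * evictable and arc_meta_limit can be honored.
	 */
	my_cache_shrink(mc, bytes_to_free);
}

static void
my_cache_setup(struct my_cache *mc)
{
	my_prune_cb = arc_add_prune_callback(my_cache_prune, mc);
}

static void
my_cache_teardown(struct my_cache *mc)
{
	arc_remove_prune_callback(my_prune_cb);
}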
size"); module_param(zfs_arc_meta_limit, ulong, 0444); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); -module_param(zfs_arc_reduce_dnlc_percent, int, 0444); -MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage"); +module_param(zfs_arc_meta_prune, int, 0444); +MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune"); module_param(zfs_arc_grow_retry, int, 0444); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index a0726e117..fb319a547 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -986,6 +986,26 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp) } EXPORT_SYMBOL(zfs_root); +#ifdef HAVE_SHRINK +int +zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) +{ + zfs_sb_t *zsb = sb->s_fs_info; + struct shrinker *shrinker = &sb->s_shrink; + struct shrink_control sc = { + .nr_to_scan = nr_to_scan, + .gfp_mask = GFP_KERNEL, + }; + + ZFS_ENTER(zsb); + *objects = (*shrinker->shrink)(shrinker, &sc); + ZFS_EXIT(zsb); + + return (0); +} +EXPORT_SYMBOL(zfs_sb_prune); +#endif /* HAVE_SHRINK */ + /* * Teardown the zfs_sb_t::z_os. * @@ -1533,6 +1553,7 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); register_filesystem(&zpl_fs_type); + (void) arc_add_prune_callback(zpl_prune_sbs, NULL); } void diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index a35e3b5f2..709ae74f8 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -269,6 +269,7 @@ zfs_inode_destroy(struct inode *ip) mutex_enter(&zsb->z_znodes_lock); list_remove(&zsb->z_all_znodes, zp); + zsb->z_nr_znodes--; mutex_exit(&zsb->z_znodes_lock); if (zp->z_acl_cached) { @@ -401,6 +402,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, mutex_enter(&zsb->z_znodes_lock); list_insert_tail(&zsb->z_all_znodes, zp); + zsb->z_nr_znodes++; membar_producer(); mutex_exit(&zsb->z_znodes_lock); diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c index 650e9c0d3..3abb26a9e 100644 --- a/module/zfs/zpl_super.c +++ b/module/zfs/zpl_super.c @@ -199,34 +199,120 @@ zpl_kill_sb(struct super_block *sb) kill_anon_super(sb); } +#ifdef HAVE_SHRINK +/* + * Linux 3.1 - 3.x API + * + * The Linux 3.1 API introduced per-sb cache shrinkers to replace the + * global ones. This allows us a mechanism to cleanly target a specific + * zfs file system when the dnode and inode caches grow too large. + * + * In addition, the 3.0 kernel added the iterate_supers_type() helper + * function which is used to safely walk all of the zfs file systems. + */ +static void +zpl_prune_sb(struct super_block *sb, void *arg) +{ + int objects = 0; + int error; + + error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects); + ASSERT3S(error, <=, 0); + + return; +} + +void +zpl_prune_sbs(int64_t bytes_to_scan, void *private) +{ + unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); + + iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan); + kmem_reap(); +} +#else +/* + * Linux 2.6.x - 3.0 API + * + * These are best effort interfaces are provided by the SPL to induce + * the Linux VM subsystem to reclaim a fraction of the both dnode and + * inode caches. Ideally, we want to just target the zfs file systems + * however our only option is to reclaim from them all. 
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
index 650e9c0d3..3abb26a9e 100644
--- a/module/zfs/zpl_super.c
+++ b/module/zfs/zpl_super.c
@@ -199,34 +199,120 @@ zpl_kill_sb(struct super_block *sb)
 	kill_anon_super(sb);
 }
 
+#ifdef HAVE_SHRINK
+/*
+ * Linux 3.1 - 3.x API
+ *
+ * The Linux 3.1 API introduced per-sb cache shrinkers to replace the
+ * global ones.  This gives us a mechanism to cleanly target a specific
+ * zfs file system when the dnode and inode caches grow too large.
+ *
+ * In addition, the 3.0 kernel added the iterate_supers_type() helper
+ * function which is used to safely walk all of the zfs file systems.
+ */
+static void
+zpl_prune_sb(struct super_block *sb, void *arg)
+{
+	int objects = 0;
+	int error;
+
+	error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects);
+	ASSERT3S(error, <=, 0);
+
+	return;
+}
+
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+	unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+	iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan);
+	kmem_reap();
+}
+#else
+/*
+ * Linux 2.6.x - 3.0 API
+ *
+ * These best effort interfaces are provided by the SPL to induce
+ * the Linux VM subsystem to reclaim a fraction of both the dnode and
+ * inode caches.  Ideally, we want to just target the zfs file systems;
+ * however, our only option is to reclaim from them all.
+ */
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+	unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+	shrink_dcache_memory(nr_to_scan, GFP_KERNEL);
+	shrink_icache_memory(nr_to_scan, GFP_KERNEL);
+	kmem_reap();
+}
+#endif /* HAVE_SHRINK */
+
+#ifdef HAVE_NR_CACHED_OBJECTS
+static int
+zpl_nr_cached_objects(struct super_block *sb)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+	int nr;
+
+	mutex_enter(&zsb->z_znodes_lock);
+	nr = zsb->z_nr_znodes;
+	mutex_exit(&zsb->z_znodes_lock);
+
+	return (nr);
+}
+#endif /* HAVE_NR_CACHED_OBJECTS */
+
+#ifdef HAVE_FREE_CACHED_OBJECTS
+/*
+ * Attempt to evict some meta data from the cache.  The ARC operates in
+ * terms of bytes while the Linux VFS uses objects.  Because this is just
+ * a best effort eviction, the exact values aren't critical, so we
+ * extrapolate from an object count to a byte size using the znode_t size.
+ */
+static void
+zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
+{
+	arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE);
+}
+#endif /* HAVE_FREE_CACHED_OBJECTS */
+
 const struct super_operations zpl_super_operations = {
-	.alloc_inode	= zpl_inode_alloc,
-	.destroy_inode	= zpl_inode_destroy,
-	.dirty_inode	= NULL,
-	.write_inode	= NULL,
-	.drop_inode	= NULL,
+	.alloc_inode		= zpl_inode_alloc,
+	.destroy_inode		= zpl_inode_destroy,
+	.dirty_inode		= NULL,
+	.write_inode		= NULL,
+	.drop_inode		= NULL,
 #ifdef HAVE_EVICT_INODE
-	.evict_inode	= zpl_evict_inode,
+	.evict_inode		= zpl_evict_inode,
 #else
-	.clear_inode	= zpl_clear_inode,
-	.delete_inode	= zpl_inode_delete,
+	.clear_inode		= zpl_clear_inode,
+	.delete_inode		= zpl_inode_delete,
 #endif /* HAVE_EVICT_INODE */
-	.put_super	= zpl_put_super,
-	.write_super	= NULL,
-	.sync_fs	= zpl_sync_fs,
-	.statfs		= zpl_statfs,
-	.remount_fs	= zpl_remount_fs,
-	.show_options	= zpl_show_options,
-	.show_stats	= NULL,
+	.put_super		= zpl_put_super,
+	.write_super		= NULL,
+	.sync_fs		= zpl_sync_fs,
+	.statfs			= zpl_statfs,
+	.remount_fs		= zpl_remount_fs,
+	.show_options		= zpl_show_options,
+	.show_stats		= NULL,
+#ifdef HAVE_NR_CACHED_OBJECTS
+	.nr_cached_objects	= zpl_nr_cached_objects,
+#endif /* HAVE_NR_CACHED_OBJECTS */
+#ifdef HAVE_FREE_CACHED_OBJECTS
+	.free_cached_objects	= zpl_free_cached_objects,
+#endif /* HAVE_FREE_CACHED_OBJECTS */
 };
 
 struct file_system_type zpl_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= ZFS_DRIVER,
+	.owner			= THIS_MODULE,
+	.name			= ZFS_DRIVER,
 #ifdef HAVE_MOUNT_NODEV
-	.mount		= zpl_mount,
+	.mount			= zpl_mount,
 #else
-	.get_sb		= zpl_get_sb,
+	.get_sb			= zpl_get_sb,
 #endif /* HAVE_MOUNT_NODEV */
-	.kill_sb	= zpl_kill_sb,
+	.kill_sb		= zpl_kill_sb,
 };
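Both zpl_prune_sbs() and zpl_free_cached_objects() rely on the same bytes-to-objects extrapolation via sizeof(znode_t). The standalone program below works through that conversion with the default arc_meta_prune value from arc.c; the znode_t size used is illustrative only, since the real value depends on the build, which is exactly why the comment above calls this a best effort extrapolation.

#include <stdio.h>

int
main(void)
{
	unsigned long arc_meta_prune = 1048576;	/* default bytes per prune pass (arc.c) */
	unsigned long znode_size = 1024;	/* hypothetical sizeof(znode_t) */

	/* zpl_prune_sbs(): bytes requested by the ARC -> objects to scan */
	unsigned long nr_to_scan = arc_meta_prune / znode_size;

	/* zpl_free_cached_objects(): objects requested by the VFS -> bytes */
	unsigned long bytes = nr_to_scan * znode_size;

	printf("%lu bytes -> %lu objects -> %lu bytes\n",
	    arc_meta_prune, nr_to_scan, bytes);
	return (0);
}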