Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/arc.c         188
-rw-r--r--  module/zfs/zfs_vfsops.c   21
-rw-r--r--  module/zfs/zfs_znode.c     2
-rw-r--r--  module/zfs/zpl_super.c   126
4 files changed, 287 insertions(+), 50 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f71c24844..95d14a9e7 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
* protected from simultaneous callbacks from arc_buf_evict()
* and arc_do_user_evicts().
*
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
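Taken on its own, the mechanism described in the comment above amounts to: a consumer registers a callback, and the ARC invokes it when references must be dropped. Below is a minimal sketch of a hypothetical consumer. Only the arc_prune_t handle and the arc_add_prune_callback()/arc_remove_prune_callback() calls come from this patch; my_cache_t, my_prune_cb, and my_cache_release are illustrative names, not part of the change.

#include <sys/arc.h>

typedef struct my_cache {
	arc_prune_t	*mc_prune;	/* handle returned at registration */
} my_cache_t;

/* Invoked from the ARC reclaim thread when arc_meta_limit is exceeded. */
static void
my_prune_cb(int64_t bytes_to_free, void *private)
{
	my_cache_t *mc = private;

	/*
	 * Drop enough references that roughly bytes_to_free bytes of
	 * ARC buffers pinned by this cache become evictable.
	 */
	my_cache_release(mc, bytes_to_free);	/* illustrative helper */
}

static void
my_cache_init(my_cache_t *mc)
{
	mc->mc_prune = arc_add_prune_callback(my_prune_cb, mc);
}

static void
my_cache_fini(my_cache_t *mc)
{
	arc_remove_prune_callback(mc->mc_prune);
}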
@@ -120,14 +128,13 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
-#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -141,8 +148,8 @@ extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
@@ -180,7 +187,7 @@ unsigned long zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
/*
* Note that buffers can be in one of 6 states:
@@ -288,6 +295,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_no_grow;
kstat_named_t arcstat_tempreserve;
kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
@@ -352,6 +360,7 @@ static arc_stats_t arc_stats = {
{ "arc_no_grow", KSTAT_DATA_UINT64 },
{ "arc_tempreserve", KSTAT_DATA_UINT64 },
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
@@ -481,6 +490,8 @@ struct arc_buf_hdr {
list_node_t b_l2node;
};
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
@@ -1925,6 +1936,48 @@ arc_adjust(void)
}
}
+/*
+ * Request that the arc user drop references so that N bytes can be
+ * released from the cache. This provides a mechanism to ensure the
+ * arc can honor the arc_meta_limit and reclaim buffers which are
+ * pinned in the cache by higher layers (i.e. the zpl).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+ arc_prune_func_t *func;
+ void *private;
+ arc_prune_t *cp, *np;
+
+ mutex_enter(&arc_prune_mtx);
+
+ cp = list_head(&arc_prune_list);
+ while (cp != NULL) {
+ func = cp->p_pfunc;
+ private = cp->p_private;
+ np = list_next(&arc_prune_list, cp);
+ refcount_add(&cp->p_refcnt, func);
+ mutex_exit(&arc_prune_mtx);
+
+ if (func != NULL)
+ func(adjustment, private);
+
+ mutex_enter(&arc_prune_mtx);
+
+ /* User removed prune callback concurrently with execution */
+ if (refcount_remove(&cp->p_refcnt, func) == 0) {
+ ASSERT(!list_link_active(&cp->p_node));
+ refcount_destroy(&cp->p_refcnt);
+ kmem_free(cp, sizeof (*cp));
+ }
+
+ cp = np;
+ }
+
+ ARCSTAT_BUMP(arcstat_prune);
+ mutex_exit(&arc_prune_mtx);
+}
+
static void
arc_do_user_evicts(void)
{
@@ -1949,6 +2002,32 @@ arc_do_user_evicts(void)
}
/*
+ * Evict only meta data objects from the cache, leaving the data
+ * objects untouched. This is only used to enforce the tunable
+ * arc_meta_limit; if we are unable to evict enough buffers we
+ * notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+ int64_t delta;
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ arc_do_user_prune(arc_meta_prune);
+}
+
+/*
* Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
@@ -2085,24 +2164,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-#ifdef _KERNEL
- int retry = 0;
-
- while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- retry++;
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
/*
* An aggressive reclamation will shrink the cache size as well as
@@ -2121,6 +2182,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
}
@@ -2131,6 +2193,7 @@ arc_reclaim_thread(void)
clock_t growtime = 0;
arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
+ int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
@@ -2160,9 +2223,14 @@ arc_reclaim_thread(void)
arc_no_grow = FALSE;
}
- /* Keep meta data usage within limits */
- if (arc_meta_used >= arc_meta_limit)
- arc_kmem_reap_now(ARC_RECLAIM_CONS);
+ /*
+ * Keep meta data usage within limits; arc_shrink() is not
+ * used here because it would also collapse the arc_c value
+ * when only the arc_meta_limit is being exceeded.
+ */
+ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+ if (prune > 0)
+ arc_adjust_meta(prune, B_TRUE);
arc_adjust();
@@ -2399,16 +2467,27 @@ arc_get_data_buf(arc_buf_t *buf)
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
+
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
+
+ /*
+ * If we are unable to recycle an existing meta buffer,
+ * signal the reclaim thread. It will notify users
+ * via the prune callback to drop references. The
+ * prune callback is run in the context of the reclaim
+ * thread to avoid deadlocking on the hash_lock.
+ */
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
+
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -3021,6 +3100,37 @@ top:
return (0);
}
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof(*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
@@ -3598,8 +3708,8 @@ arc_init(void)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
- if (zfs_arc_reduce_dnlc_percent > 0)
- arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+ if (zfs_arc_meta_prune > 0)
+ arc_meta_prune = zfs_arc_meta_prune;
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
@@ -3646,7 +3756,10 @@ arc_init(void)
buf_init();
arc_thread_exit = 0;
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
arc_eviction_list = NULL;
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
@@ -3674,6 +3787,8 @@ arc_init(void)
void
arc_fini(void)
{
+ arc_prune_t *p;
+
mutex_enter(&arc_reclaim_thr_lock);
#ifdef _KERNEL
spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3808,17 @@ arc_fini(void)
arc_ksp = NULL;
}
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ refcount_remove(&p->p_refcnt, &arc_prune_list);
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
@@ -4774,6 +4900,8 @@ l2arc_stop(void)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
module_param(zfs_arc_min, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4912,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
module_param(zfs_arc_meta_limit, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
module_param(zfs_arc_grow_retry, int, 0444);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
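Both new knobs are observable from user space once the module is loaded: under the standard Linux module_param convention the tunable should appear as /sys/module/zfs/parameters/zfs_arc_meta_prune (read-only at runtime per the 0444 mode above, so it must be set at module load time), and the new arc_prune counter is exported alongside the other ARC statistics in the arcstats kstat (/proc/spl/kstat/zfs/arcstats under the SPL).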
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index a0726e117..fb319a547 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -986,6 +986,26 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp)
}
EXPORT_SYMBOL(zfs_root);
+#ifdef HAVE_SHRINK
+int
+zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+ zfs_sb_t *zsb = sb->s_fs_info;
+ struct shrinker *shrinker = &sb->s_shrink;
+ struct shrink_control sc = {
+ .nr_to_scan = nr_to_scan,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ ZFS_ENTER(zsb);
+ *objects = (*shrinker->shrink)(shrinker, &sc);
+ ZFS_EXIT(zsb);
+
+ return (0);
+}
+EXPORT_SYMBOL(zfs_sb_prune);
+#endif /* HAVE_SHRINK */
+
/*
* Teardown the zfs_sb_t::z_os.
*
@@ -1533,6 +1553,7 @@ zfs_init(void)
zfs_znode_init();
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
register_filesystem(&zpl_fs_type);
+ (void) arc_add_prune_callback(zpl_prune_sbs, NULL);
}
void
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index a35e3b5f2..709ae74f8 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -269,6 +269,7 @@ zfs_inode_destroy(struct inode *ip)
mutex_enter(&zsb->z_znodes_lock);
list_remove(&zsb->z_all_znodes, zp);
+ zsb->z_nr_znodes--;
mutex_exit(&zsb->z_znodes_lock);
if (zp->z_acl_cached) {
@@ -401,6 +402,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
mutex_enter(&zsb->z_znodes_lock);
list_insert_tail(&zsb->z_all_znodes, zp);
+ zsb->z_nr_znodes++;
membar_producer();
mutex_exit(&zsb->z_znodes_lock);
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
index 650e9c0d3..3abb26a9e 100644
--- a/module/zfs/zpl_super.c
+++ b/module/zfs/zpl_super.c
@@ -199,34 +199,120 @@ zpl_kill_sb(struct super_block *sb)
kill_anon_super(sb);
}
+#ifdef HAVE_SHRINK
+/*
+ * Linux 3.1 - 3.x API
+ *
+ * The Linux 3.1 API introduced per-sb cache shrinkers to replace the
+ * global ones. This allows us a mechanism to cleanly target a specific
+ * zfs file system when the dnode and inode caches grow too large.
+ *
+ * In addition, the 3.0 kernel added the iterate_supers_type() helper
+ * function which is used to safely walk all of the zfs file systems.
+ */
+static void
+zpl_prune_sb(struct super_block *sb, void *arg)
+{
+ int objects = 0;
+ int error;
+
+ error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects);
+ ASSERT3S(error, <=, 0);
+}
+
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+ iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan);
+ kmem_reap();
+}
+#else
+/*
+ * Linux 2.6.x - 3.0 API
+ *
+ * These best effort interfaces are provided by the SPL to induce
+ * the Linux VM subsystem to reclaim a fraction of both the dnode and
+ * inode caches. Ideally we want to target just the zfs file systems;
+ * however, our only option is to reclaim from them all.
+ */
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+ shrink_dcache_memory(nr_to_scan, GFP_KERNEL);
+ shrink_icache_memory(nr_to_scan, GFP_KERNEL);
+ kmem_reap();
+}
+#endif /* HAVE_SHRINK */
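In both variants above, the byte count handed down by the ARC is converted into an object count for the VFS by dividing by sizeof(znode_t). As a rough worked example, assuming a znode_t of about 1 KiB (the actual size varies by platform and build options):

	bytes_to_scan = arc_meta_prune = 1048576 bytes (the 1 MiB default)
	nr_to_scan    = 1048576 / ~1024 bytes per znode = ~1024 znodes per file system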
+
+#ifdef HAVE_NR_CACHED_OBJECTS
+static int
+zpl_nr_cached_objects(struct super_block *sb)
+{
+ zfs_sb_t *zsb = sb->s_fs_info;
+ int nr;
+
+ mutex_enter(&zsb->z_znodes_lock);
+ nr = zsb->z_nr_znodes;
+ mutex_exit(&zsb->z_znodes_lock);
+
+ return (nr);
+}
+#endif /* HAVE_NR_CACHED_OBJECTS */
+
+#ifdef HAVE_FREE_CACHED_OBJECTS
+/*
+ * Attempt to evict some meta data from the cache. The ARC operates in
+ * terms of bytes while the Linux VFS uses objects. Because this is
+ * just a best effort eviction and the exact values aren't critical,
+ * we extrapolate from an object count to a byte size using the
+ * znode_t size.
+ */
+static void
+zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
+{
+ arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE);
+}
+#endif /* HAVE_FREE_CACHED_OBJECTS */
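Here the conversion runs in the opposite direction: a VFS request to scan nr_to_scan objects becomes a request to evict roughly nr_to_scan * sizeof(znode_t) bytes of ARC metadata. Passing B_FALSE for may_prune keeps arc_adjust_meta() from firing the prune callbacks, presumably because this call already originates from the VFS shrinker and pruning would recurse back into it.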
+
const struct super_operations zpl_super_operations = {
- .alloc_inode = zpl_inode_alloc,
- .destroy_inode = zpl_inode_destroy,
- .dirty_inode = NULL,
- .write_inode = NULL,
- .drop_inode = NULL,
+ .alloc_inode = zpl_inode_alloc,
+ .destroy_inode = zpl_inode_destroy,
+ .dirty_inode = NULL,
+ .write_inode = NULL,
+ .drop_inode = NULL,
#ifdef HAVE_EVICT_INODE
- .evict_inode = zpl_evict_inode,
+ .evict_inode = zpl_evict_inode,
#else
- .clear_inode = zpl_clear_inode,
- .delete_inode = zpl_inode_delete,
+ .clear_inode = zpl_clear_inode,
+ .delete_inode = zpl_inode_delete,
#endif /* HAVE_EVICT_INODE */
- .put_super = zpl_put_super,
- .write_super = NULL,
- .sync_fs = zpl_sync_fs,
- .statfs = zpl_statfs,
- .remount_fs = zpl_remount_fs,
- .show_options = zpl_show_options,
- .show_stats = NULL,
+ .put_super = zpl_put_super,
+ .write_super = NULL,
+ .sync_fs = zpl_sync_fs,
+ .statfs = zpl_statfs,
+ .remount_fs = zpl_remount_fs,
+ .show_options = zpl_show_options,
+ .show_stats = NULL,
+#ifdef HAVE_NR_CACHED_OBJECTS
+ .nr_cached_objects = zpl_nr_cached_objects,
+#endif /* HAVE_NR_CACHED_OBJECTS */
+#ifdef HAVE_FREE_CACHED_OBJECTS
+ .free_cached_objects = zpl_free_cached_objects,
+#endif /* HAVE_FREE_CACHED_OBJECTS */
};
struct file_system_type zpl_fs_type = {
- .owner = THIS_MODULE,
- .name = ZFS_DRIVER,
+ .owner = THIS_MODULE,
+ .name = ZFS_DRIVER,
#ifdef HAVE_MOUNT_NODEV
- .mount = zpl_mount,
+ .mount = zpl_mount,
#else
- .get_sb = zpl_get_sb,
+ .get_sb = zpl_get_sb,
#endif /* HAVE_MOUNT_NODEV */
- .kill_sb = zpl_kill_sb,
+ .kill_sb = zpl_kill_sb,
};