kmem_cache hardening and performance improvements

- Added slab work queue task which gradually ages and frees slabs from the cache which have not been used recently. - Optimized slab packing algorithm to ensure each slab contains the maximum number of objects without creating too large a slab. - Fix deadlock, we can never call kv_free() under the skc_lock. We now unlink the objects and slabs from the cache itself and attach them to a private work list. The contents of the list are then subsequently freed outside the spin lock. - Move magazine create/destroy operation on to local cpu. - Further performance optimizations by minimizing the usage of the large per-cache skc_lock. This includes the addition of KMC_BIT_REAPING bit mask which is used to prevent concurrent reaping, and to defer new slab creation when reaping is occurring. - Add KMC_BIT_DESTROY bit mask which is set when the cache is being destroyed, this is used to catch any task accessing the cache while it is being destroyed. - Add comments to all the functions and additional comments to try and make everything as clear as possible. - Major cleanup and additions to the SPLAT kmem tests to more rigorously stress the cache implementation and look for any problems. This includes correctness and performance tests. - Updated portable work queue interfaces
author: Brian Behlendorf <[email protected]> 2009-01-30 20:54:49 -0800
committer: Brian Behlendorf <[email protected]> 2009-01-30 20:54:49 -0800
commit: ea3e6ca9e595ebfba82b964ee2eaf1ddd7076f0f (patch)
tree: 7480b87145297f3882ffe18234280512e136cdb4 /module
parent: 34e71c9e97f4d0d2b3ede850d016a7de558b0f3c (diff)
3 files changed, 977 insertions, 456 deletions
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index a68f8efe9..83eefe293 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -132,10 +132,6 @@ EXPORT_SYMBOL(kmem_set_warning);
  * small virtual address space on 32bit arches.  This will seriously
  * constrain the size of the slab caches and their performance.
  *
- * XXX: Implement work requests to keep an eye on each cache and
- *      shrink them via spl_slab_reclaim() when they are wasting lots
- *      of space.  Currently this process is driven by the reapers.
- *
  * XXX: Improve the partial slab list by carefully maintaining a
  *      strict ordering of fullest to emptiest slabs based on
  *      the slab reference count.  This gaurentees the when freeing
@@ -571,7 +567,8 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 	}
 }
 
-/* It's important that we pack the spl_kmem_obj_t structure and the
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
  * actual objects in to one large address space to minimize the number
  * of calls to the allocator.  It is far better to do a few large
  * allocations and then subdivide it ourselves.  Now which allocator
@@ -662,14 +659,17 @@ out:
 	RETURN(sks);
 }
 
-/* Removes slab from complete or partial list, so it must
- * be called with the 'skc->skc_lock' held.
+/*
+ * Remove a slab from complete or partial list, it must be called with
+ * the 'skc->skc_lock' held but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
  */
 static void
-spl_slab_free(spl_kmem_slab_t *sks) {
+spl_slab_free(spl_kmem_slab_t *sks,
+	      struct list_head *sks_list, struct list_head *sko_list)
+{
 	spl_kmem_cache_t *skc;
 	spl_kmem_obj_t *sko, *n;
-	int size;
 	ENTRY;
 
 	ASSERT(sks->sks_magic == SKS_MAGIC);
@@ -682,114 +682,190 @@ spl_slab_free(spl_kmem_slab_t *sks) {
 	skc->skc_obj_total -= sks->sks_objs;
 	skc->skc_slab_total--;
 	list_del(&sks->sks_list);
-	size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
-	       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
 
 	/* Run destructors slab is being released */
 	list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
 		ASSERT(sko->sko_magic == SKO_MAGIC);
+		list_del(&sko->sko_list);
 
 		if (skc->skc_dtor)
 			skc->skc_dtor(sko->sko_addr, skc->skc_private);
 
 		if (skc->skc_flags & KMC_OFFSLAB)
-			kv_free(skc, sko->sko_addr, size);
+			list_add(&sko->sko_list, sko_list);
 	}
 
-	kv_free(skc, sks, skc->skc_slab_size);
+	list_add(&sks->sks_list, sks_list);
 	EXIT;
 }
 
-static int
-__spl_slab_reclaim(spl_kmem_cache_t *skc)
+/*
+ * Traverses all the partial slabs attached to a cache and frees those
+ * which are currently empty, and have not been touched for
+ * skc_delay seconds.  This is to avoid thrashing.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
 {
 	spl_kmem_slab_t *sks, *m;
-	int rc = 0;
+	spl_kmem_obj_t *sko, *n;
+	LIST_HEAD(sks_list);
+	LIST_HEAD(sko_list);
+	int size;
 	ENTRY;
 
-	ASSERT(spin_is_locked(&skc->skc_lock));
 	/*
-	 * Free empty slabs which have not been touched in skc_delay
-	 * seconds.  This delay time is important to avoid thrashing.
-	 * Empty slabs will be at the end of the skc_partial_list.
+	 * Move empty slabs and objects which have not been touched in
+	 * skc_delay seconds on to private lists to be freed outside
+	 * the spin lock.  This delay time is important to avoid
+	 * thrashing however when flag is set the delay will not be
+	 * used.  Empty slabs will be at the end of the skc_partial_list.
 	 */
+	spin_lock(&skc->skc_lock);
         list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
 					 sks_list) {
 		if (sks->sks_ref > 0)
 		       break;
 
-		if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
-			spl_slab_free(sks);
-			rc++;
-		}
+		if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
+			spl_slab_free(sks, &sks_list, &sko_list);
 	}
+	spin_unlock(&skc->skc_lock);
 
-	/* Returns number of slabs reclaimed */
-	RETURN(rc);
+	/*
+	 * We only have list of spl_kmem_obj_t's if they are located off
+	 * the slab, otherwise they get freed with the spl_kmem_slab_t.
+	 */
+	if (!list_empty(&sko_list)) {
+		ASSERT(skc->skc_flags & KMC_OFFSLAB);
+
+		size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
+		       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
+
+		list_for_each_entry_safe(sko, n, &sko_list, sko_list)
+			kv_free(skc, sko->sko_addr, size);
+	}
+
+	list_for_each_entry_safe(sks, m, &sks_list, sks_list)
+		kv_free(skc, sks, skc->skc_slab_size);
+
+	EXIT;
 }
 
-static int
-spl_slab_reclaim(spl_kmem_cache_t *skc)
+/*
+ * Called regularly on all caches to age objects out of the magazines
+ * which have not been accessed in skc->skc_delay seconds.  This prevents
+ * idle magazines from holding memory which might be better used by
+ * other caches or parts of the system.  The delay is present to
+ * prevent thrashing the magazine.
+ */
+static void
+spl_magazine_age(void *data)
 {
-	int rc;
-	ENTRY;
+	spl_kmem_cache_t *skc = data;
+	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 
-	spin_lock(&skc->skc_lock);
-	rc = __spl_slab_reclaim(skc);
-	spin_unlock(&skc->skc_lock);
+	if (skm->skm_avail > 0 &&
+	    time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
+		(void)spl_cache_flush(skc, skm, skm->skm_refill);
+}
 
-	RETURN(rc);
+/*
+ * Called regularly to keep a downward pressure on the size of idle
+ * magazines and to release free slabs from the cache.  This function
+ * never calls the registered reclaim function, that only occurs
+ * under memory pressure or with a direct call to spl_kmem_reap().
+ */
+static void
+spl_cache_age(void *data)
+{
+        spl_kmem_cache_t *skc =
+		spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	on_each_cpu(spl_magazine_age, skc, 0, 1);
+	spl_slab_reclaim(skc, 0);
+
+	if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
+		schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
 }
 
-/* Size slabs properly to ensure they are not too large */
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page.  Also for
+ * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
+ * lower than this and we will fail.
+ */
 static int
 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 {
-	int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
-	int align = skc->skc_obj_align;
-
-	*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+	int sks_size, obj_size, max_size, align;
 
 	if (skc->skc_flags & KMC_OFFSLAB) {
+		*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
 		*size = sizeof(spl_kmem_slab_t);
 	} else {
-resize:
-		*size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
-			*objs * (P2ROUNDUP(skc->skc_obj_size, align) +
-		        P2ROUNDUP(sizeof(spl_kmem_obj_t), align));
+		align = skc->skc_obj_align;
+		sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align);
+		obj_size = P2ROUNDUP(skc->skc_obj_size, align) +
+                           P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
+
+		if (skc->skc_flags & KMC_KMEM)
+			max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE;
+		else
+			max_size = (32 * 1024 * 1024);
 
-		if (*size > max)
-			GOTO(resize, *objs = *objs - 1);
+		for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) {
+			*objs = (*size - sks_size) / obj_size;
+			if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
+				RETURN(0);
+		}
 
-		ASSERT(*objs > 0);
+		/*
+		 * Unable to satisfy target objects per slab, fall back to
+		 * allocating a maximally sized slab and, assuming it can
+		 * contain the minimum object count, use it.  If not, fail.
+		 */
+		*size = max_size;
+		*objs = (*size - sks_size) / obj_size;
+		if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
+			RETURN(0);
 	}
 
-	ASSERTF(*size <= max, "%d < %d\n", *size, max);
-	RETURN(0);
+	RETURN(-ENOSPC);
 }
 
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine.  Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
 static int
 spl_magazine_size(spl_kmem_cache_t *skc)
 {
 	int size, align = skc->skc_obj_align;
 	ENTRY;
 
-	/* Guesses for reasonable magazine sizes, they
-	 * should really adapt based on observed usage. */
+	/* Per-magazine sizes below assume a 4Kib page size */
 	if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
-		size = 4;
+		size = 4;  /* Minimum 4Mib per-magazine */
 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
-		size = 16;
+		size = 16; /* Minimum 2Mib per-magazine */
 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
-		size = 64;
+		size = 64; /* Minimum 256Kib per-magazine */
 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
-		size = 128;
+		size = 128; /* Minimum 128Kib per-magazine */
 	else
-		size = 512;
+		size = 256;
 
 	RETURN(size);
 }
 
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
 static spl_kmem_magazine_t *
 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
 {
@@ -798,19 +874,21 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
 	           sizeof(void *) * skc->skc_mag_size;
 	ENTRY;
 
-	skm = kmem_alloc_node(size, GFP_KERNEL, node);
+	skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node);
 	if (skm) {
 		skm->skm_magic = SKM_MAGIC;
 		skm->skm_avail = 0;
 		skm->skm_size = skc->skc_mag_size;
 		skm->skm_refill = skc->skc_mag_refill;
-		if (!(skc->skc_flags & KMC_NOTOUCH))
-			skm->skm_age = jiffies;
+		skm->skm_age = jiffies;
 	}
 
 	RETURN(skm);
 }
 
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
 static void
 spl_magazine_free(spl_kmem_magazine_t *skm)
 {
@@ -825,44 +903,72 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
 	EXIT;
 }
 
+static void
+__spl_magazine_create(void *data)
+{
+        spl_kmem_cache_t *skc = data;
+	int id = smp_processor_id();
+
+	skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
+	ASSERT(skc->skc_mag[id]);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
 static int
 spl_magazine_create(spl_kmem_cache_t *skc)
 {
-	int i;
 	ENTRY;
 
 	skc->skc_mag_size = spl_magazine_size(skc);
-	skc->skc_mag_refill = (skc->skc_mag_size + 1)  / 2;
+	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+	on_each_cpu(__spl_magazine_create, skc, 0, 1);
 
-	for_each_online_cpu(i) {
-		skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
-		if (!skc->skc_mag[i]) {
-			for (i--; i >= 0; i--)
-				spl_magazine_free(skc->skc_mag[i]);
+	RETURN(0);
+}
 
-			RETURN(-ENOMEM);
-		}
-	}
+static void
+__spl_magazine_destroy(void *data)
+{
+        spl_kmem_cache_t *skc = data;
+	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 
-	RETURN(0);
+	(void)spl_cache_flush(skc, skm, skm->skm_avail);
+	spl_magazine_free(skm);
 }
 
+/*
+ * Destroy all per-cpu magazines.
+ */
 static void
 spl_magazine_destroy(spl_kmem_cache_t *skc)
 {
-        spl_kmem_magazine_t *skm;
-	int i;
 	ENTRY;
-
-	for_each_online_cpu(i) {
-		skm = skc->skc_mag[i];
-		(void)spl_cache_flush(skc, skm, skm->skm_avail);
-		spl_magazine_free(skm);
-	}
-
+	on_each_cpu(__spl_magazine_destroy, skc, 0, 1);
 	EXIT;
 }
 
+/*
+ * Create an object cache based on the following arguments:
+ * name		cache name
+ * size		cache object size
+ * align	cache object alignment
+ * ctor		cache object constructor
+ * dtor		cache object destructor
+ * reclaim	cache object reclaim
+ * priv		cache private data for ctor/dtor/reclaim
+ * vmp		unused must be NULL
+ * flags
+ *	KMC_NOTOUCH	Disable cache object aging (unsupported)
+ *	KMC_NODEBUG	Disable debugging (unsupported)
+ *	KMC_NOMAGAZINE	Disable magazine (unsupported)
+ *	KMC_NOHASH      Disable hashing (unsupported)
+ *	KMC_QCACHE	Disable qcache (unsupported)
+ *	KMC_KMEM	Force kmem backed cache
+ *	KMC_VMEM        Force vmem backed cache
+ *	KMC_OFFSLAB	Locate objects off the slab
+ */
 spl_kmem_cache_t *
 spl_kmem_cache_create(char *name, size_t size, size_t align,
                       spl_kmem_ctor_t ctor,
@@ -908,6 +1014,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 	skc->skc_obj_size = size;
 	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 	skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+	atomic_set(&skc->skc_ref, 0);
 
 	INIT_LIST_HEAD(&skc->skc_list);
 	INIT_LIST_HEAD(&skc->skc_complete_list);
@@ -947,6 +1054,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 	if (rc)
 		GOTO(out, rc);
 
+	spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
+	schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+
 	down_write(&spl_kmem_cache_sem);
 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 	up_write(&spl_kmem_cache_sem);
@@ -959,10 +1069,13 @@ out:
 }
 EXPORT_SYMBOL(spl_kmem_cache_create);
 
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
 void
 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 {
-        spl_kmem_slab_t *sks, *m;
+	DECLARE_WAIT_QUEUE_HEAD(wq);
 	ENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -971,20 +1084,27 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 	list_del_init(&skc->skc_list);
 	up_write(&spl_kmem_cache_sem);
 
+	/* Cancel any and wait for any pending delayed work */
+	ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+	cancel_delayed_work(&skc->skc_work);
+	flush_scheduled_work();
+
+	/* Wait until all current callers complete, this is mainly
+	 * to catch the case where a low memory situation triggers a
+	 * cache reaping action which races with this destroy. */
+	wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
 	spl_magazine_destroy(skc);
+	spl_slab_reclaim(skc, 1);
 	spin_lock(&skc->skc_lock);
 
 	/* Validate there are no objects in use and free all the
 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
+	ASSERT3U(skc->skc_slab_alloc, ==, 0);
+	ASSERT3U(skc->skc_obj_alloc, ==, 0);
+	ASSERT3U(skc->skc_slab_total, ==, 0);
+	ASSERT3U(skc->skc_obj_total, ==, 0);
 	ASSERT(list_empty(&skc->skc_complete_list));
-	ASSERT(skc->skc_slab_alloc == 0);
-	ASSERT(skc->skc_obj_alloc == 0);
-
-	list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
-		spl_slab_free(sks);
-
-	ASSERT(skc->skc_slab_total == 0);
-	ASSERT(skc->skc_obj_total == 0);
 
 	kmem_free(skc->skc_name, skc->skc_name_size);
 	spin_unlock(&skc->skc_lock);
@@ -995,6 +1115,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 }
 EXPORT_SYMBOL(spl_kmem_cache_destroy);
 
+/*
+ * Allocate an object from a slab attached to the cache.  This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
 static void *
 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 {
@@ -1030,10 +1154,11 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 	return sko->sko_addr;
 }
 
-/* No available objects create a new slab.  Since this is an
- * expensive operation we do it without holding the spinlock
- * and only briefly aquire it when we link in the fully
- * allocated and constructed slab.
+/*
+ * No available objects on any slabs, create a new slab.  Since this
+ * is an expensive operation we do it without holding the spinlock and
+ * only briefly acquire it when we link in the fully allocated and
+ * constructed slab.
  */
 static spl_kmem_slab_t *
 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
@@ -1042,34 +1167,42 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 	ENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
+	local_irq_enable();
+	might_sleep();
 
-	if (flags & __GFP_WAIT) {
-		flags |= __GFP_NOFAIL;
-		local_irq_enable();
-		might_sleep();
-	}
-
-	sks = spl_slab_alloc(skc, flags);
-	if (sks == NULL) {
-	        if (flags & __GFP_WAIT)
-			local_irq_disable();
-
-		RETURN(NULL);
+	/*
+	 * Before allocating a new slab check if the slab is being reaped.
+	 * If it is there is a good chance we can wait until it finishes
+	 * and then use one of the newly freed but not aged-out slabs.
+	 */
+	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		schedule();
+		GOTO(out, sks= NULL);
 	}
 
-	if (flags & __GFP_WAIT)
-		local_irq_disable();
+	/* Allocate a new slab for the cache */
+	sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN);
+	if (sks == NULL)
+		GOTO(out, sks = NULL);
 
-	/* Link the new empty slab in to the end of skc_partial_list */
+	/* Link the new empty slab in to the end of skc_partial_list. */
 	spin_lock(&skc->skc_lock);
 	skc->skc_slab_total++;
 	skc->skc_obj_total += sks->sks_objs;
 	list_add_tail(&sks->sks_list, &skc->skc_partial_list);
 	spin_unlock(&skc->skc_lock);
+out:
+	local_irq_disable();
 
 	RETURN(sks);
 }
 
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this
+ * cache.  Ideally the magazine can be repopulated using existing
+ * objects which have been released, however if we are unable to
+ * locate enough free objects new slabs of objects will be created.
+ */
 static int
 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 {
@@ -1080,13 +1213,11 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
-	/* XXX: Check for refill bouncing by age perhaps */
 	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
-
 	spin_lock(&skc->skc_lock);
 
 	while (refill > 0) {
-		/* No slabs available we must grow the cache */
+		/* No slabs available we may need to grow the cache */
 		if (list_empty(&skc->skc_partial_list)) {
 			spin_unlock(&skc->skc_lock);
 
@@ -1135,6 +1266,9 @@ out:
 	RETURN(rc);
 }
 
+/*
+ * Release an object back to the slab from which it came.
+ */
 static void
 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
 {
@@ -1176,6 +1310,13 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
 	EXIT;
 }
 
+/*
+ * Release a batch of objects from a per-cpu magazine back to their
+ * respective slabs.  This occurs when we exceed the magazine size,
+ * are under memory pressure, when the cache is idle, or during
+ * cache cleanup.  The flush argument contains the number of entries
+ * to remove from the magazine.
+ */
 static int
 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 {
@@ -1185,12 +1326,17 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
+	/*
+	 * XXX: Currently we simply return objects from the magazine to
+	 * the slabs in fifo order.  The ideal thing to do from a memory
+	 * fragmentation standpoint is to cheaply determine the set of
+	 * objects in the magazine which will result in the largest
+	 * number of free slabs if released from the magazine.
+	 */
 	spin_lock(&skc->skc_lock);
-
 	for (i = 0; i < count; i++)
 		spl_cache_shrink(skc, skm->skm_objs[i]);
 
-//	__spl_slab_reclaim(skc);
 	skm->skm_avail -= count;
 	memmove(skm->skm_objs, &(skm->skm_objs[count]),
 	        sizeof(void *) * skm->skm_avail);
@@ -1200,6 +1346,10 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 	RETURN(count);
 }
 
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
 void *
 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
 {
@@ -1209,7 +1359,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
 	ENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
-	ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+	ASSERT(flags & KM_SLEEP);
+	atomic_inc(&skc->skc_ref);
 	local_irq_save(irq_flags);
 
 restart:
@@ -1225,8 +1377,7 @@ restart:
 	if (likely(skm->skm_avail)) {
 		/* Object available in CPU cache, use it */
 		obj = skm->skm_objs[--skm->skm_avail];
-		if (!(skc->skc_flags & KMC_NOTOUCH))
-			skm->skm_age = jiffies;
+		skm->skm_age = jiffies;
 	} else {
 		/* Per-CPU cache empty, directly allocate from
 		 * the slab and refill the per-CPU cache. */
@@ -1240,11 +1391,18 @@ restart:
 
 	/* Pre-emptively migrate object to CPU L1 cache */
 	prefetchw(obj);
+	atomic_dec(&skc->skc_ref);
 
 	RETURN(obj);
 }
 EXPORT_SYMBOL(spl_kmem_cache_alloc);
 
+/*
+ * Free an object back to the local per-cpu magazine, there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from.  We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
 void
 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 {
@@ -1253,6 +1411,8 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 	ENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+	atomic_inc(&skc->skc_ref);
 	local_irq_save(flags);
 
 	/* Safe to update per-cpu structure without lock, but
@@ -1270,62 +1430,87 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 	skm->skm_objs[skm->skm_avail++] = obj;
 
 	local_irq_restore(flags);
+	atomic_dec(&skc->skc_ref);
 
 	EXIT;
 }
 EXPORT_SYMBOL(spl_kmem_cache_free);
 
+/*
+ * The generic shrinker function for all caches.  Under linux a shrinker
+ * may not be tightly coupled with a slab cache.  In fact linux always
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects.  Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.  The shrinker should return the number of free objects
+ * in the cache when called with nr_to_scan == 0 but not attempt to
+ * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
+ * objects should be freed, because Solaris semantics are to free
+ * all available objects we may free more objects than requested.
+ */
 static int
 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
 {
 	spl_kmem_cache_t *skc;
+	int unused = 0;
 
-	/* Under linux a shrinker is not tightly coupled with a slab
-	 * cache.  In fact linux always systematically trys calling all
-	 * registered shrinker callbacks until its target reclamation level
-	 * is reached.  Because of this we only register one shrinker
-	 * function in the shim layer for all slab caches.  And we always
-	 * attempt to shrink all caches when this generic shrinker is called.
-	 */
 	down_read(&spl_kmem_cache_sem);
-
-	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
-		spl_kmem_cache_reap_now(skc);
-
+	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+		if (nr_to_scan)
+			spl_kmem_cache_reap_now(skc);
+
+		/*
+		 * Presume everything alloc'ed is reclaimable, this ensures
+		 * we are called again with nr_to_scan > 0 so can try and
+		 * reclaim.  The exact number is not important either so
+		 * we forgo taking this already highly contended lock.
+		 */
+		unused += skc->skc_obj_alloc;
+	}
 	up_read(&spl_kmem_cache_sem);
 
-	/* XXX: Under linux we should return the remaining number of
-	 * entries in the cache.  We should do this as well.
-	 */
-	return 1;
+	return (unused * sysctl_vfs_cache_pressure) / 100;
 }
 
+/*
+ * Call the registered reclaim function for a cache.  Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out.  Objects which cannot
+ * fit in the magazine will be released back to their slabs which will
+ * also need to age out before being released.  This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
 void
 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
 {
-	spl_kmem_magazine_t *skm;
-	int i;
 	ENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
-	if (skc->skc_reclaim)
-		skc->skc_reclaim(skc->skc_private);
+	/* Prevent concurrent cache reaping when contended */
+	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		EXIT;
+		return;
+	}
 
-	/* Ensure per-CPU caches which are idle gradually flush */
-	for_each_online_cpu(i) {
-		skm = skc->skc_mag[i];
+	atomic_inc(&skc->skc_ref);
 
-		if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
-			(void)spl_cache_flush(skc, skm, skm->skm_refill);
-	}
+	if (skc->skc_reclaim)
+		skc->skc_reclaim(skc->skc_private);
 
-	spl_slab_reclaim(skc);
+	spl_slab_reclaim(skc, 0);
+	clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
+	atomic_dec(&skc->skc_ref);
 
 	EXIT;
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
 
+/*
+ * Reap all free slabs from all registered caches.
+ */
 void
 spl_kmem_reap(void)
 {
diff --git a/module/splat/splat-internal.h b/module/splat/splat-internal.h
index 87c47b173..0fa177c02 100644
--- a/module/splat/splat-internal.h
+++ b/module/splat/splat-internal.h
@@ -40,6 +40,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/list.h>
+#include <linux/swap.h>
 
 #include <asm/ioctls.h>
 #include <asm/uaccess.h>
diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c
index 9b96fce90..c592e983c 100644
--- a/module/splat/splat-kmem.c
+++ b/module/splat/splat-kmem.c
@@ -4,9 +4,9 @@
  *  Copyright (c) 2008 Lawrence Livermore National Security, LLC.
  *  Produced at Lawrence Livermore National Laboratory
  *  Written by:
- *          Brian Behlendorf <[email protected]>,
- *          Herb Wartens <[email protected]>,
- *          Jim Garlick <[email protected]>
+ *	  Brian Behlendorf <[email protected]>,
+ *	  Herb Wartens <[email protected]>,
+ *	  Jim Garlick <[email protected]>
  *  UCRL-CODE-235197
  *
  *  This is free software; you can redistribute it and/or modify it
@@ -47,30 +47,37 @@
 #define SPLAT_KMEM_TEST4_DESC		"Memory allocation test (vmem_zalloc)"
 
 #define SPLAT_KMEM_TEST5_ID		0x0105
-#define SPLAT_KMEM_TEST5_NAME		"kmem_small"
+#define SPLAT_KMEM_TEST5_NAME		"slab_small"
 #define SPLAT_KMEM_TEST5_DESC		"Slab ctor/dtor test (small)"
 
 #define SPLAT_KMEM_TEST6_ID		0x0106
-#define SPLAT_KMEM_TEST6_NAME		"kmem_large"
+#define SPLAT_KMEM_TEST6_NAME		"slab_large"
 #define SPLAT_KMEM_TEST6_DESC		"Slab ctor/dtor test (large)"
 
 #define SPLAT_KMEM_TEST7_ID		0x0107
-#define SPLAT_KMEM_TEST7_NAME		"kmem_reap"
-#define SPLAT_KMEM_TEST7_DESC		"Slab reaping test"
+#define SPLAT_KMEM_TEST7_NAME		"slab_align"
+#define SPLAT_KMEM_TEST7_DESC		"Slab alignment test"
 
 #define SPLAT_KMEM_TEST8_ID		0x0108
-#define SPLAT_KMEM_TEST8_NAME		"kmem_lock"
-#define SPLAT_KMEM_TEST8_DESC		"Slab locking test"
+#define SPLAT_KMEM_TEST8_NAME		"slab_reap"
+#define SPLAT_KMEM_TEST8_DESC		"Slab reaping test"
 
 #define SPLAT_KMEM_TEST9_ID		0x0109
-#define SPLAT_KMEM_TEST9_NAME		"kmem_align"
-#define SPLAT_KMEM_TEST9_DESC		"Slab alignment test"
+#define SPLAT_KMEM_TEST9_NAME		"slab_age"
+#define SPLAT_KMEM_TEST9_DESC		"Slab aging test"
+
+#define SPLAT_KMEM_TEST10_ID		0x010a
+#define SPLAT_KMEM_TEST10_NAME		"slab_lock"
+#define SPLAT_KMEM_TEST10_DESC		"Slab locking test"
+
+#define SPLAT_KMEM_TEST11_ID		0x010b
+#define SPLAT_KMEM_TEST11_NAME		"slab_overcommit"
+#define SPLAT_KMEM_TEST11_DESC		"Slab memory overcommit test"
 
 #define SPLAT_KMEM_ALLOC_COUNT		10
 #define SPLAT_VMEM_ALLOC_COUNT		10
 
 
-/* XXX - This test may fail under tight memory conditions */
 static int
 splat_kmem_test1(struct file *file, void *arg)
 {
@@ -96,8 +103,8 @@ splat_kmem_test1(struct file *file, void *arg)
 				kmem_free(ptr[i], size);
 
 		splat_vprint(file, SPLAT_KMEM_TEST1_NAME,
-	                   "%d byte allocations, %d/%d successful\n",
-		           size, count, SPLAT_KMEM_ALLOC_COUNT);
+			   "%d byte allocations, %d/%d successful\n",
+			   size, count, SPLAT_KMEM_ALLOC_COUNT);
 		if (count != SPLAT_KMEM_ALLOC_COUNT)
 			rc = -ENOMEM;
 
@@ -134,8 +141,8 @@ splat_kmem_test2(struct file *file, void *arg)
 			for (j = 0; j < size; j++) {
 				if (((char *)ptr[i])[j] != '\0') {
 					splat_vprint(file, SPLAT_KMEM_TEST2_NAME,
-				                  "%d-byte allocation was "
-					          "not zeroed\n", size);
+						  "%d-byte allocation was "
+						  "not zeroed\n", size);
 					rc = -EFAULT;
 				}
 			}
@@ -146,8 +153,8 @@ splat_kmem_test2(struct file *file, void *arg)
 				kmem_free(ptr[i], size);
 
 		splat_vprint(file, SPLAT_KMEM_TEST2_NAME,
-	                   "%d byte allocations, %d/%d successful\n",
-		           size, count, SPLAT_KMEM_ALLOC_COUNT);
+			   "%d byte allocations, %d/%d successful\n",
+			   size, count, SPLAT_KMEM_ALLOC_COUNT);
 		if (count != SPLAT_KMEM_ALLOC_COUNT)
 			rc = -ENOMEM;
 
@@ -180,8 +187,8 @@ splat_kmem_test3(struct file *file, void *arg)
 				vmem_free(ptr[i], size);
 
 		splat_vprint(file, SPLAT_KMEM_TEST3_NAME,
-	                   "%d byte allocations, %d/%d successful\n",
-		           size, count, SPLAT_VMEM_ALLOC_COUNT);
+			   "%d byte allocations, %d/%d successful\n",
+			   size, count, SPLAT_VMEM_ALLOC_COUNT);
 		if (count != SPLAT_VMEM_ALLOC_COUNT)
 			rc = -ENOMEM;
 
@@ -212,8 +219,8 @@ splat_kmem_test4(struct file *file, void *arg)
 			for (j = 0; j < size; j++) {
 				if (((char *)ptr[i])[j] != '\0') {
 					splat_vprint(file, SPLAT_KMEM_TEST4_NAME,
-				                  "%d-byte allocation was "
-					          "not zeroed\n", size);
+						  "%d-byte allocation was "
+						  "not zeroed\n", size);
 					rc = -EFAULT;
 				}
 			}
@@ -224,8 +231,8 @@ splat_kmem_test4(struct file *file, void *arg)
 				vmem_free(ptr[i], size);
 
 		splat_vprint(file, SPLAT_KMEM_TEST4_NAME,
-	                   "%d byte allocations, %d/%d successful\n",
-		           size, count, SPLAT_VMEM_ALLOC_COUNT);
+			   "%d byte allocations, %d/%d successful\n",
+			   size, count, SPLAT_VMEM_ALLOC_COUNT);
 		if (count != SPLAT_VMEM_ALLOC_COUNT)
 			rc = -ENOMEM;
 
@@ -237,8 +244,11 @@ splat_kmem_test4(struct file *file, void *arg)
 
 #define SPLAT_KMEM_TEST_MAGIC		0x004488CCUL
 #define SPLAT_KMEM_CACHE_NAME		"kmem_test"
-#define SPLAT_KMEM_OBJ_COUNT		128
-#define SPLAT_KMEM_OBJ_RECLAIM		16
+#define SPLAT_KMEM_OBJ_COUNT		1024
+#define SPLAT_KMEM_OBJ_RECLAIM		20 /* percent */
+#define SPLAT_KMEM_THREADS		32
+
+#define KCP_FLAG_READY			0x01
 
 typedef struct kmem_cache_data {
 	unsigned long kcd_magic;
@@ -246,21 +256,95 @@ typedef struct kmem_cache_data {
 	char kcd_buf[0];
 } kmem_cache_data_t;
 
+typedef struct kmem_cache_thread {
+	kmem_cache_t *kct_cache;
+	spinlock_t kct_lock;
+	int kct_id;
+	int kct_kcd_count;
+	kmem_cache_data_t *kct_kcd[0];
+} kmem_cache_thread_t;
+
 typedef struct kmem_cache_priv {
 	unsigned long kcp_magic;
 	struct file *kcp_file;
 	kmem_cache_t *kcp_cache;
-	kmem_cache_data_t *kcp_kcd[SPLAT_KMEM_OBJ_COUNT];
 	spinlock_t kcp_lock;
-	wait_queue_head_t kcp_waitq;
+	wait_queue_head_t kcp_ctl_waitq;
+	wait_queue_head_t kcp_thr_waitq;
+	int kcp_flags;
+	int kcp_kct_count;
+	kmem_cache_thread_t *kcp_kct[SPLAT_KMEM_THREADS];
 	int kcp_size;
 	int kcp_align;
 	int kcp_count;
-	int kcp_threads;
 	int kcp_alloc;
 	int kcp_rc;
+	int kcp_kcd_count;
+	kmem_cache_data_t *kcp_kcd[0];
 } kmem_cache_priv_t;
 
+static kmem_cache_priv_t *
+splat_kmem_cache_test_kcp_alloc(struct file *file, char *name,
+				int size, int align, int alloc, int count)
+{
+	kmem_cache_priv_t *kcp;
+
+	kcp = vmem_zalloc(sizeof(kmem_cache_priv_t) +
+			  count * sizeof(kmem_cache_data_t *), KM_SLEEP);
+	if (!kcp)
+		return NULL;
+
+	kcp->kcp_magic = SPLAT_KMEM_TEST_MAGIC;
+	kcp->kcp_file = file;
+	kcp->kcp_cache = NULL;
+	spin_lock_init(&kcp->kcp_lock);
+	init_waitqueue_head(&kcp->kcp_ctl_waitq);
+	init_waitqueue_head(&kcp->kcp_thr_waitq);
+	kcp->kcp_flags = 0;
+	kcp->kcp_kct_count = -1;
+	kcp->kcp_size = size;
+	kcp->kcp_align = align;
+	kcp->kcp_count = 0;
+	kcp->kcp_alloc = alloc;
+	kcp->kcp_rc = 0;
+	kcp->kcp_kcd_count = count;
+
+	return kcp;
+}
+
+static void
+splat_kmem_cache_test_kcp_free(kmem_cache_priv_t *kcp)
+{
+	vmem_free(kcp, sizeof(kmem_cache_priv_t) +
+		  kcp->kcp_kcd_count * sizeof(kmem_cache_data_t *));
+}
+
+static kmem_cache_thread_t *
+splat_kmem_cache_test_kct_alloc(int id, int count)
+{
+	kmem_cache_thread_t *kct;
+
+	ASSERTF(id < SPLAT_KMEM_THREADS, "id=%d\n", id);
+	kct = vmem_zalloc(sizeof(kmem_cache_thread_t) +
+			  count * sizeof(kmem_cache_data_t *), KM_SLEEP);
+	if (!kct)
+		return NULL;
+
+	spin_lock_init(&kct->kct_lock);
+	kct->kct_cache = NULL;
+	kct->kct_id = id;
+	kct->kct_kcd_count = count;
+
+	return kct;
+}
+
+static void
+splat_kmem_cache_test_kct_free(kmem_cache_thread_t *kct)
+{
+	vmem_free(kct, sizeof(kmem_cache_thread_t) +
+		  kct->kct_kcd_count * sizeof(kmem_cache_data_t *));
+}
+
 static int
 splat_kmem_cache_test_constructor(void *ptr, void *priv, int flags)
 {
@@ -293,83 +377,340 @@ splat_kmem_cache_test_destructor(void *ptr, void *priv)
 	return;
 }
 
+/*
+ * Generic reclaim function which assumes that all objects may
+ * be reclaimed at any time.  We free a small percentage of the
+ * objects linked off the kcp or kct[] every time we are called.
+ */
+static void
+splat_kmem_cache_test_reclaim(void *priv)
+{
+	kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv;
+	kmem_cache_thread_t *kct;
+	int i, j, count;
+
+	ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+	count = kcp->kcp_kcd_count * SPLAT_KMEM_OBJ_RECLAIM / 100;
+
+	/* Objects directly attached to the kcp */
+	spin_lock(&kcp->kcp_lock);
+	for (i = 0; i < kcp->kcp_kcd_count; i++) {
+		if (kcp->kcp_kcd[i]) {
+			kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
+			kcp->kcp_kcd[i] = NULL;
+
+			if ((--count) == 0)
+				break;
+		}
+	}
+	spin_unlock(&kcp->kcp_lock);
+
+	/* No threads containing objects to consider */
+	if (kcp->kcp_kct_count == -1)
+		return;
+
+	/* Objects attached to a kct thread */
+	for (i = 0; i < kcp->kcp_kct_count; i++) {
+		spin_lock(&kcp->kcp_lock);
+		kct = kcp->kcp_kct[i];
+		spin_unlock(&kcp->kcp_lock);
+		if (!kct)
+			continue;
+
+		spin_lock(&kct->kct_lock);
+		count = kct->kct_kcd_count * SPLAT_KMEM_OBJ_RECLAIM / 100;
+
+		for (j = 0; j < kct->kct_kcd_count; j++) {
+			if (kct->kct_kcd[j]) {
+				kmem_cache_free(kcp->kcp_cache,kct->kct_kcd[j]);
+				kct->kct_kcd[j] = NULL;
+
+				if ((--count) == 0)
+					break;
+			}
+		}
+		spin_unlock(&kct->kct_lock);
+	}
+
+	return;
+}
+
+static int
+splat_kmem_cache_test_threads(kmem_cache_priv_t *kcp, int threads)
+{
+	int rc;
+
+	spin_lock(&kcp->kcp_lock);
+	rc = (kcp->kcp_kct_count == threads);
+	spin_unlock(&kcp->kcp_lock);
+
+	return rc;
+}
+
+static int
+splat_kmem_cache_test_flags(kmem_cache_priv_t *kcp, int flags)
+{
+	int rc;
+
+	spin_lock(&kcp->kcp_lock);
+	rc = (kcp->kcp_flags & flags);
+	spin_unlock(&kcp->kcp_lock);
+
+	return rc;
+}
+
+static void
+splat_kmem_cache_test_thread(void *arg)
+{
+	kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)arg;
+	kmem_cache_thread_t *kct;
+	int rc = 0, id, i;
+	void *obj;
+
+	ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+
+	/* Assign thread ids */
+	spin_lock(&kcp->kcp_lock);
+	if (kcp->kcp_kct_count == -1)
+		kcp->kcp_kct_count = 0;
+
+	id = kcp->kcp_kct_count;
+	kcp->kcp_kct_count++;
+	spin_unlock(&kcp->kcp_lock);
+
+	kct = splat_kmem_cache_test_kct_alloc(id, kcp->kcp_alloc);
+	if (!kct) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&kcp->kcp_lock);
+	kcp->kcp_kct[id] = kct;
+	spin_unlock(&kcp->kcp_lock);
+
+	/* Wait for all threads to have started and report they are ready */
+	if (kcp->kcp_kct_count == SPLAT_KMEM_THREADS)
+		wake_up(&kcp->kcp_ctl_waitq);
+
+	wait_event(kcp->kcp_thr_waitq,
+		splat_kmem_cache_test_flags(kcp, KCP_FLAG_READY));
+
+	/*
+	 * Updates to kct->kct_kcd[] are performed under a spin_lock so
+	 * they may safely run concurrent with the reclaim function.  If
+	 * we are not in a low memory situation we have one lock per-
+	 * thread so they are not expected to be contended.
+	 */
+	for (i = 0; i < kct->kct_kcd_count; i++) {
+		obj = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+		spin_lock(&kct->kct_lock);
+		kct->kct_kcd[i] = obj;
+		spin_unlock(&kct->kct_lock);
+	}
+
+	for (i = 0; i < kct->kct_kcd_count; i++) {
+		spin_lock(&kct->kct_lock);
+		if (kct->kct_kcd[i]) {
+			kmem_cache_free(kcp->kcp_cache, kct->kct_kcd[i]);
+			kct->kct_kcd[i] = NULL;
+		}
+		spin_unlock(&kct->kct_lock);
+	}
+out:
+	spin_lock(&kcp->kcp_lock);
+	if (kct) {
+		splat_kmem_cache_test_kct_free(kct);
+		kcp->kcp_kct[id] = kct = NULL;
+	}
+
+	if (!kcp->kcp_rc)
+		kcp->kcp_rc = rc;
+
+	if ((--kcp->kcp_kct_count) == 0)
+		wake_up(&kcp->kcp_ctl_waitq);
+
+	spin_unlock(&kcp->kcp_lock);
+
+	thread_exit();
+}
+
 static int
 splat_kmem_cache_test(struct file *file, void *arg, char *name,
-			   int size, int align, int flags)
+		      int size, int align, int flags)
 {
-	kmem_cache_t *cache = NULL;
-	kmem_cache_data_t *kcd = NULL;
-	kmem_cache_priv_t kcp;
+	kmem_cache_priv_t *kcp;
+	kmem_cache_data_t *kcd;
 	int rc = 0, max;
 
-	kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-	kcp.kcp_file = file;
-	kcp.kcp_size = size;
-	kcp.kcp_align = align;
-	kcp.kcp_count = 0;
-	kcp.kcp_rc = 0;
-
-	cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
-				  kcp.kcp_size, kcp.kcp_align,
-	                          splat_kmem_cache_test_constructor,
-	                          splat_kmem_cache_test_destructor,
-	                          NULL, &kcp, NULL, flags);
-	if (!cache) {
+	kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0, 1);
+	if (!kcp) {
+		splat_vprint(file, name, "Unable to create '%s'\n", "kcp");
+		return -ENOMEM;
+	}
+
+	kcp->kcp_cache =
+		kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
+				  kcp->kcp_size, kcp->kcp_align,
+				  splat_kmem_cache_test_constructor,
+				  splat_kmem_cache_test_destructor,
+				  NULL, kcp, NULL, flags);
+	if (!kcp->kcp_cache) {
 		splat_vprint(file, name,
-	                     "Unable to create '%s'\n",
+			     "Unable to create '%s'\n",
 			     SPLAT_KMEM_CACHE_NAME);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto out_free;
 	}
 
-	kcd = kmem_cache_alloc(cache, KM_SLEEP);
+	kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
 	if (!kcd) {
 		splat_vprint(file, name,
-	                     "Unable to allocate from '%s'\n",
-		             SPLAT_KMEM_CACHE_NAME);
+			     "Unable to allocate from '%s'\n",
+			     SPLAT_KMEM_CACHE_NAME);
 		rc = -EINVAL;
 		goto out_free;
 	}
+	spin_lock(&kcp->kcp_lock);
+	kcp->kcp_kcd[0] = kcd;
+	spin_unlock(&kcp->kcp_lock);
 
-	if (!kcd->kcd_flag) {
+	if (!kcp->kcp_kcd[0]->kcd_flag) {
 		splat_vprint(file, name,
-		             "Failed to run contructor for '%s'\n",
-		             SPLAT_KMEM_CACHE_NAME);
+			     "Failed to run contructor for '%s'\n",
+			     SPLAT_KMEM_CACHE_NAME);
 		rc = -EINVAL;
 		goto out_free;
 	}
 
-	if (kcd->kcd_magic != kcp.kcp_magic) {
+	if (kcp->kcp_kcd[0]->kcd_magic != kcp->kcp_magic) {
 		splat_vprint(file, name,
-		             "Failed to pass private data to constructor "
-		             "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+			     "Failed to pass private data to constructor "
+			     "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
 		rc = -EINVAL;
 		goto out_free;
 	}
 
-	max = kcp.kcp_count;
-	kmem_cache_free(cache, kcd);
+	max = kcp->kcp_count;
+	spin_lock(&kcp->kcp_lock);
+	kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[0]);
+	kcp->kcp_kcd[0] = NULL;
+	spin_unlock(&kcp->kcp_lock);
 
 	/* Destroy the entire cache which will force destructors to
 	 * run and we can verify one was called for every object */
-	kmem_cache_destroy(cache);
-	if (kcp.kcp_count) {
+	kmem_cache_destroy(kcp->kcp_cache);
+	if (kcp->kcp_count) {
 		splat_vprint(file, name,
-		             "Failed to run destructor on all slab objects "
-		             "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+			     "Failed to run destructor on all slab objects "
+			     "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
 		rc = -EINVAL;
 	}
 
 	splat_vprint(file, name,
-	             "Successfully ran ctors/dtors for %d elements in '%s'\n",
-	             max, SPLAT_KMEM_CACHE_NAME);
+		     "Successfully ran ctors/dtors for %d elements in '%s'\n",
+		     max, SPLAT_KMEM_CACHE_NAME);
 
 	return rc;
 
 out_free:
-	if (kcd)
-		kmem_cache_free(cache, kcd);
+	if (kcp->kcp_kcd[0]) {
+		spin_lock(&kcp->kcp_lock);
+		kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[0]);
+		kcp->kcp_kcd[0] = NULL;
+		spin_unlock(&kcp->kcp_lock);
+	}
+
+	if (kcp->kcp_cache)
+		kmem_cache_destroy(kcp->kcp_cache);
+
+	splat_kmem_cache_test_kcp_free(kcp);
+
+	return rc;
+}
+
+static int
+splat_kmem_cache_thread_test(struct file *file, void *arg, char *name,
+			     int size, int alloc)
+{
+	kmem_cache_priv_t *kcp;
+	kthread_t *thr;
+	struct timespec start, stop, delta;
+	char cache_name[32];
+	int i, rc = 0;
+
+	kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, 0, alloc, 0);
+	if (!kcp) {
+		splat_vprint(file, name, "Unable to create '%s'\n", "kcp");
+		return -ENOMEM;
+	}
+
+	(void)snprintf(cache_name, 32, "%s-%d-%d",
+		       SPLAT_KMEM_CACHE_NAME, size, alloc);
+	kcp->kcp_cache =
+		kmem_cache_create(cache_name, kcp->kcp_size, 0,
+				  splat_kmem_cache_test_constructor,
+				  splat_kmem_cache_test_destructor,
+				  splat_kmem_cache_test_reclaim,
+				  kcp, NULL, KMC_VMEM);
+	if (!kcp->kcp_cache) {
+		splat_vprint(file, name, "Unable to create '%s'\n", cache_name);
+		rc = -ENOMEM;
+		goto out_kcp;
+	}
+
+	start = current_kernel_time();
+
+	for (i = 0; i < SPLAT_KMEM_THREADS; i++) {
+		thr = thread_create(NULL, 0,
+				    splat_kmem_cache_test_thread,
+				    kcp, 0, &p0, TS_RUN, minclsyspri);
+		if (thr == NULL) {
+			rc = -ESRCH;
+			goto out_cache;
+		}
+	}
+
+	/* Sleep until all threads have started, then set the ready
+	 * flag and wake them all up for maximum concurrency. */
+	wait_event(kcp->kcp_ctl_waitq,
+		   splat_kmem_cache_test_threads(kcp, SPLAT_KMEM_THREADS));
+
+	spin_lock(&kcp->kcp_lock);
+	kcp->kcp_flags |= KCP_FLAG_READY;
+	spin_unlock(&kcp->kcp_lock);
+	wake_up_all(&kcp->kcp_thr_waitq);
+
+	/* Sleep until all threads have finished */
+	wait_event(kcp->kcp_ctl_waitq, splat_kmem_cache_test_threads(kcp, 0));
+
+	stop = current_kernel_time();
+	delta = timespec_sub(stop, start);
 
-	kmem_cache_destroy(cache);
+	splat_vprint(file, name,
+		     "%-22s %2ld.%09ld\t"
+		     "%lu/%lu/%lu\t%lu/%lu/%lu\n",
+		     kcp->kcp_cache->skc_name,
+		     delta.tv_sec, delta.tv_nsec,
+		     (unsigned long)kcp->kcp_cache->skc_slab_total,
+		     (unsigned long)kcp->kcp_cache->skc_slab_max,
+		     (unsigned long)(kcp->kcp_alloc *
+				    SPLAT_KMEM_THREADS /
+				    SPL_KMEM_CACHE_OBJ_PER_SLAB),
+		     (unsigned long)kcp->kcp_cache->skc_obj_total,
+		     (unsigned long)kcp->kcp_cache->skc_obj_max,
+		     (unsigned long)(kcp->kcp_alloc *
+				     SPLAT_KMEM_THREADS));
+
+	if (delta.tv_sec >= 5)
+		rc = -ETIME;
+
+	if (!rc && kcp->kcp_rc)
+		rc = kcp->kcp_rc;
+
+out_cache:
+	kmem_cache_destroy(kcp->kcp_cache);
+out_kcp:
+	splat_kmem_cache_test_kcp_free(kcp);
 	return rc;
 }
 
@@ -409,291 +750,279 @@ splat_kmem_test6(struct file *file, void *arg)
 	return splat_kmem_cache_test(file, arg, name, 128*1028, 0, KMC_VMEM);
 }
 
-static void
-splat_kmem_cache_test_reclaim(void *priv)
+/* Validate object alignment cache behavior for caches */
+static int
+splat_kmem_test7(struct file *file, void *arg)
 {
-	kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv;
-	int i, count;
-
-	count = min(SPLAT_KMEM_OBJ_RECLAIM, kcp->kcp_count);
-	splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST7_NAME,
-                     "Reaping %d objects from '%s'\n", count,
-	             SPLAT_KMEM_CACHE_NAME);
-
-	for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) {
-		if (kcp->kcp_kcd[i]) {
-			kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
-			kcp->kcp_kcd[i] = NULL;
+	char *name = SPLAT_KMEM_TEST7_NAME;
+	int i, rc;
 
-			if (--count == 0)
-				break;
-		}
+	for (i = 8; i <= PAGE_SIZE; i *= 2) {
+		rc = splat_kmem_cache_test(file, arg, name, 157, i, 0);
+		if (rc)
+			return rc;
 	}
 
-	return;
+	return rc;
 }
 
 static int
-splat_kmem_test7(struct file *file, void *arg)
+splat_kmem_test8(struct file *file, void *arg)
 {
-	kmem_cache_t *cache;
-	kmem_cache_priv_t kcp;
-	int i, rc = 0;
-
-	kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-	kcp.kcp_file = file;
-	kcp.kcp_size = 256;
-	kcp.kcp_count = 0;
-	kcp.kcp_rc = 0;
-
-	cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0,
-	                          splat_kmem_cache_test_constructor,
-	                          splat_kmem_cache_test_destructor,
-	                          splat_kmem_cache_test_reclaim,
-				  &kcp, NULL, 0);
-	if (!cache) {
-		splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-	                   "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+	kmem_cache_priv_t *kcp;
+	kmem_cache_data_t *kcd;
+	int i, j, rc = 0;
+
+	kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST8_NAME,
+					      256, 0, 0, SPLAT_KMEM_OBJ_COUNT);
+	if (!kcp) {
+		splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+			     "Unable to create '%s'\n", "kcp");
 		return -ENOMEM;
 	}
 
-	kcp.kcp_cache = cache;
+	kcp->kcp_cache =
+		kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0,
+				  splat_kmem_cache_test_constructor,
+				  splat_kmem_cache_test_destructor,
+				  splat_kmem_cache_test_reclaim,
+				  kcp, NULL, 0);
+	if (!kcp->kcp_cache) {
+		splat_kmem_cache_test_kcp_free(kcp);
+		splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+			   "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+		return -ENOMEM;
+	}
 
 	for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) {
-		/* All allocations need not succeed */
-		kcp.kcp_kcd[i] = kmem_cache_alloc(cache, KM_SLEEP);
-		if (!kcp.kcp_kcd[i]) {
-			splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-		                   "Unable to allocate from '%s'\n",
-			           SPLAT_KMEM_CACHE_NAME);
+		kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+		spin_lock(&kcp->kcp_lock);
+		kcp->kcp_kcd[i] = kcd;
+		spin_unlock(&kcp->kcp_lock);
+		if (!kcd) {
+			splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+				   "Unable to allocate from '%s'\n",
+				   SPLAT_KMEM_CACHE_NAME);
 		}
 	}
 
-	ASSERT(kcp.kcp_count > 0);
-
 	/* Request the slab cache free any objects it can.  For a few reasons
 	 * this may not immediately result in more free memory even if objects
 	 * are freed.  First off, due to fragmentation we may not be able to
 	 * reclaim any slabs.  Secondly, even if we do we fully clear some
 	 * slabs we will not want to immedately reclaim all of them because
 	 * we may contend with cache allocs and thrash.  What we want to see
-	 * is slab size decrease more gradually as it becomes clear they
+	 * is the slab size decrease more gradually as it becomes clear they
 	 * will not be needed.  This should be acheivable in less than minute
 	 * if it takes longer than this something has gone wrong.
 	 */
 	for (i = 0; i < 60; i++) {
-		kmem_cache_reap_now(cache);
-		splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-                             "%s cache objects %d, slabs %u/%u objs %u/%u\n",
-			     SPLAT_KMEM_CACHE_NAME, kcp.kcp_count,
-			    (unsigned)cache->skc_slab_alloc,
-			    (unsigned)cache->skc_slab_total,
-			    (unsigned)cache->skc_obj_alloc,
-			    (unsigned)cache->skc_obj_total);
-
-		if (cache->skc_obj_total == 0)
+		kmem_cache_reap_now(kcp->kcp_cache);
+		splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+			     "%s cache objects %d, slabs %u/%u objs %u/%u mags ",
+			     SPLAT_KMEM_CACHE_NAME, kcp->kcp_count,
+			    (unsigned)kcp->kcp_cache->skc_slab_alloc,
+			    (unsigned)kcp->kcp_cache->skc_slab_total,
+			    (unsigned)kcp->kcp_cache->skc_obj_alloc,
+			    (unsigned)kcp->kcp_cache->skc_obj_total);
+
+		for_each_online_cpu(j)
+			splat_print(file, "%u/%u ",
+				     kcp->kcp_cache->skc_mag[j]->skm_avail,
+				     kcp->kcp_cache->skc_mag[j]->skm_size);
+
+		splat_print(file, "%s\n", "");
+
+		if (kcp->kcp_cache->skc_obj_total == 0)
 			break;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ);
 	}
 
-	if (cache->skc_obj_total == 0) {
-		splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
+	if (kcp->kcp_cache->skc_obj_total == 0) {
+		splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
 			"Successfully created %d objects "
 			"in cache %s and reclaimed them\n",
-		        SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
+			SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
 	} else {
-		splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
+		splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
 			"Failed to reclaim %u/%d objects from cache %s\n",
-		        (unsigned)cache->skc_obj_total, SPLAT_KMEM_OBJ_COUNT,
-			SPLAT_KMEM_CACHE_NAME);
+			(unsigned)kcp->kcp_cache->skc_obj_total,
+			SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
 		rc = -ENOMEM;
 	}
 
 	/* Cleanup our mess (for failure case of time expiring) */
+	spin_lock(&kcp->kcp_lock);
 	for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++)
-		if (kcp.kcp_kcd[i])
-			kmem_cache_free(cache, kcp.kcp_kcd[i]);
+		if (kcp->kcp_kcd[i])
+			kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
+	spin_unlock(&kcp->kcp_lock);
 
-	kmem_cache_destroy(cache);
+	kmem_cache_destroy(kcp->kcp_cache);
+	splat_kmem_cache_test_kcp_free(kcp);
 
 	return rc;
 }
 
-static void
-splat_kmem_test8_thread(void *arg)
+static int
+splat_kmem_test9(struct file *file, void *arg)
 {
-	kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)arg;
-	int count = kcp->kcp_alloc, rc = 0, i;
-	void **objs;
-
-	ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+	kmem_cache_priv_t *kcp;
+	kmem_cache_data_t *kcd;
+	int i, j, rc = 0, count = SPLAT_KMEM_OBJ_COUNT * 128;
+
+	kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST9_NAME,
+					      256, 0, 0, count);
+	if (!kcp) {
+		splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+			     "Unable to create '%s'\n", "kcp");
+		return -ENOMEM;
+	}
 
-	objs = vmem_zalloc(count * sizeof(void *), KM_SLEEP);
-	if (!objs) {
-		splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
-	                     "Unable to alloc objp array for cache '%s'\n",
-		             kcp->kcp_cache->skc_name);
-		rc = -ENOMEM;
-		goto out;
+	kcp->kcp_cache =
+		kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0,
+				  splat_kmem_cache_test_constructor,
+				  splat_kmem_cache_test_destructor,
+				  NULL, kcp, NULL, 0);
+	if (!kcp->kcp_cache) {
+		splat_kmem_cache_test_kcp_free(kcp);
+		splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+			   "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+		return -ENOMEM;
 	}
 
 	for (i = 0; i < count; i++) {
-		objs[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
-		if (!objs[i]) {
-			splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
-		                     "Unable to allocate from cache '%s'\n",
-			             kcp->kcp_cache->skc_name);
-			rc = -ENOMEM;
-			break;
+		kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+		spin_lock(&kcp->kcp_lock);
+		kcp->kcp_kcd[i] = kcd;
+		spin_unlock(&kcp->kcp_lock);
+		if (!kcd) {
+			splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+				   "Unable to allocate from '%s'\n",
+				   SPLAT_KMEM_CACHE_NAME);
 		}
 	}
 
-	for (i = 0; i < count; i++)
-		if (objs[i])
-			kmem_cache_free(kcp->kcp_cache, objs[i]);
-
-	vmem_free(objs, count * sizeof(void *));
-out:
 	spin_lock(&kcp->kcp_lock);
-	if (!kcp->kcp_rc)
-		kcp->kcp_rc = rc;
-
-	if (--kcp->kcp_threads == 0)
-	        wake_up(&kcp->kcp_waitq);
-
+	for (i = 0; i < count; i++)
+		if (kcp->kcp_kcd[i])
+			kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
 	spin_unlock(&kcp->kcp_lock);
 
-        thread_exit();
-}
+	/* We have allocated a large number of objects thus creating a
+	 * large number of slabs and then free'd them all.  However since
+	 * there should be little memory pressure at the moment those
+	 * slabs have not been freed.  What we want to see is the slab
+	 * size decrease gradually as it becomes clear they will not
+	 * be needed.  This should be achievable in less than a minute;
+	 * if it takes longer than this something has gone wrong.
+	 */
+	for (i = 0; i < 60; i++) {
+		splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+			     "%s cache objects %d, slabs %u/%u objs %u/%u mags ",
+			     SPLAT_KMEM_CACHE_NAME, kcp->kcp_count,
+			    (unsigned)kcp->kcp_cache->skc_slab_alloc,
+			    (unsigned)kcp->kcp_cache->skc_slab_total,
+			    (unsigned)kcp->kcp_cache->skc_obj_alloc,
+			    (unsigned)kcp->kcp_cache->skc_obj_total);
+
+		for_each_online_cpu(j)
+			splat_print(file, "%u/%u ",
+				     kcp->kcp_cache->skc_mag[j]->skm_avail,
+				     kcp->kcp_cache->skc_mag[j]->skm_size);
+
+		splat_print(file, "%s\n", "");
+
+		if (kcp->kcp_cache->skc_obj_total == 0)
+			break;
 
-static int
-splat_kmem_test8_count(kmem_cache_priv_t *kcp, int threads)
-{
-	int ret;
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ);
+	}
 
-	spin_lock(&kcp->kcp_lock);
-	ret = (kcp->kcp_threads == threads);
-	spin_unlock(&kcp->kcp_lock);
+	if (kcp->kcp_cache->skc_obj_total == 0) {
+		splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+			"Successfully created %d objects "
+			"in cache %s and reclaimed them\n",
+			count, SPLAT_KMEM_CACHE_NAME);
+	} else {
+		splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+			"Failed to reclaim %u/%d objects from cache %s\n",
+			(unsigned)kcp->kcp_cache->skc_obj_total, count,
+			SPLAT_KMEM_CACHE_NAME);
+		rc = -ENOMEM;
+	}
+
+	kmem_cache_destroy(kcp->kcp_cache);
+	splat_kmem_cache_test_kcp_free(kcp);
 
-	return ret;
+	return rc;
 }
 
-/* This test will always pass and is simply here so I can easily
- * eyeball the slab cache locking overhead to ensure it is reasonable.
+/*
+ * This test creates N threads with a shared kmem cache.  They then all
+ * concurrently allocate and free from the cache to stress the locking and
+ * concurrent cache performance.  If any one test takes longer than 5
+ * seconds to complete it is treated as a failure and may indicate a
+ * performance regression.  On my test system no one test takes more
+ * than 1 second to complete so a 5x slowdown is likely a problem.
  */
 static int
-splat_kmem_test8_sc(struct file *file, void *arg, int size, int count)
+splat_kmem_test10(struct file *file, void *arg)
 {
-	kmem_cache_priv_t kcp;
-	kthread_t *thr;
-	struct timespec start, stop, delta;
-	char cache_name[32];
-	int i, j, rc = 0, threads = 32;
-
-	kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-	kcp.kcp_file = file;
-
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "name",
-	             "time (sec)\tslabs       \tobjs        \thash\n");
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "",
-	             "          \ttot/max/calc\ttot/max/calc\n");
-
-	for (i = 1; i <= count; i *= 2) {
-		kcp.kcp_size = size;
-		kcp.kcp_count = 0;
-		kcp.kcp_threads = 0;
-		kcp.kcp_alloc = i;
-		kcp.kcp_rc = 0;
-	        spin_lock_init(&kcp.kcp_lock);
-	        init_waitqueue_head(&kcp.kcp_waitq);
-
-		(void)snprintf(cache_name, 32, "%s-%d-%d",
-			       SPLAT_KMEM_CACHE_NAME, size, i);
-		kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
-	                                  splat_kmem_cache_test_constructor,
-	                                  splat_kmem_cache_test_destructor,
-					  NULL, &kcp, NULL, 0);
-		if (!kcp.kcp_cache) {
-			splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
-		                     "Unable to create '%s' cache\n",
-				     SPLAT_KMEM_CACHE_NAME);
-			rc = -ENOMEM;
-			break;
-		}
-
-		start = current_kernel_time();
-
-		for (j = 0; j < threads; j++) {
-			thr = thread_create(NULL, 0, splat_kmem_test8_thread,
-			                    &kcp, 0, &p0, TS_RUN, minclsyspri);
-			if (thr == NULL) {
-				rc = -ESRCH;
-				break;
-			}
-			spin_lock(&kcp.kcp_lock);
-			kcp.kcp_threads++;
-			spin_unlock(&kcp.kcp_lock);
-		}
+	uint64_t size, alloc, free_mem, rc = 0;
 
-	        /* Sleep until the thread sets kcp.kcp_threads == 0 */
-	        wait_event(kcp.kcp_waitq, splat_kmem_test8_count(&kcp, 0));
-		stop = current_kernel_time();
-		delta = timespec_sub(stop, start);
+	free_mem = nr_free_pages() * PAGE_SIZE;
+	for (size = 16; size <= 1024*1024; size *= 2) {
 
-	        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %2ld.%09ld\t"
-			     "%lu/%lu/%lu\t%lu/%lu/%lu\n",
-			     kcp.kcp_cache->skc_name,
-			     delta.tv_sec, delta.tv_nsec,
-			     (unsigned long)kcp.kcp_cache->skc_slab_total,
-			     (unsigned long)kcp.kcp_cache->skc_slab_max,
-			     (unsigned long)(kcp.kcp_alloc * threads /
-					    SPL_KMEM_CACHE_OBJ_PER_SLAB),
-			     (unsigned long)kcp.kcp_cache->skc_obj_total,
-			     (unsigned long)kcp.kcp_cache->skc_obj_max,
-			     (unsigned long)(kcp.kcp_alloc * threads));
+		splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "name",
+			     "time (sec)\tslabs       \tobjs	\thash\n");
+		splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "",
+			     "	  \ttot/max/calc\ttot/max/calc\n");
 
-		kmem_cache_destroy(kcp.kcp_cache);
+		for (alloc = 1; alloc <= 1024; alloc *= 2) {
 
-		if (!rc && kcp.kcp_rc)
-			rc = kcp.kcp_rc;
+			/* Skip tests which exceed free memory */
+			if (size * alloc * SPLAT_KMEM_THREADS > free_mem / 2)
+				continue;
 
-		if (rc)
-			break;
+			rc = splat_kmem_cache_thread_test(file, arg,
+				SPLAT_KMEM_TEST10_NAME, size, alloc);
+			if (rc)
+				break;
+		}
 	}
 
 	return rc;
 }
 
+/*
+ * This test creates N threads with a shared kmem cache which overcommits
+ * memory by 4x.  This makes it impossible for the slab to satisfy the
+ * thread requirements without having its reclaim hook run which will
+ * free objects back for use.  This behavior is triggered by the Linux VM
+ * detecting a low memory condition on the node and invoking the shrinkers.
+ * This should allow all the threads to complete while avoiding deadlock
+ * and for the most part out of memory events.  This is very tough on the
+ * system so it is possible the test app may get oom'ed.
+ */
 static int
-splat_kmem_test8(struct file *file, void *arg)
+splat_kmem_test11(struct file *file, void *arg)
 {
-	int i, rc = 0;
+	uint64_t size, alloc, rc;
 
-	/* Run through slab cache with objects size from
-	 * 16-1Mb in 4x multiples with 1024 objects each */
-	for (i = 16; i <= 1024*1024; i *= 4) {
-		rc = splat_kmem_test8_sc(file, arg, i, 256);
-		if (rc)
-			break;
-	}
-
-	return rc;
-}
+	size = 1024*1024;
+	alloc = ((4 * num_physpages * PAGE_SIZE) / size) / SPLAT_KMEM_THREADS;
 
-/* Validate object alignment cache behavior for caches */
-static int
-splat_kmem_test9(struct file *file, void *arg)
-{
-	char *name = SPLAT_KMEM_TEST9_NAME;
-	int i, rc;
+	splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "name",
+		     "time (sec)\tslabs       \tobjs	\thash\n");
+	splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "",
+		     "	  \ttot/max/calc\ttot/max/calc\n");
 
-	for (i = 8; i <= PAGE_SIZE; i *= 2) {
-		rc = splat_kmem_cache_test(file, arg, name, 157, i, 0);
-		if (rc)
-			return rc;
-	}
+	rc = splat_kmem_cache_thread_test(file, arg,
+		SPLAT_KMEM_TEST11_NAME, size, alloc);
 
 	return rc;
 }
@@ -701,60 +1030,66 @@ splat_kmem_test9(struct file *file, void *arg)
 splat_subsystem_t *
 splat_kmem_init(void)
 {
-        splat_subsystem_t *sub;
+	splat_subsystem_t *sub;
 
-        sub = kmalloc(sizeof(*sub), GFP_KERNEL);
-        if (sub == NULL)
-                return NULL;
+	sub = kmalloc(sizeof(*sub), GFP_KERNEL);
+	if (sub == NULL)
+		return NULL;
 
-        memset(sub, 0, sizeof(*sub));
-        strncpy(sub->desc.name, SPLAT_KMEM_NAME, SPLAT_NAME_SIZE);
+	memset(sub, 0, sizeof(*sub));
+	strncpy(sub->desc.name, SPLAT_KMEM_NAME, SPLAT_NAME_SIZE);
 	strncpy(sub->desc.desc, SPLAT_KMEM_DESC, SPLAT_DESC_SIZE);
-        INIT_LIST_HEAD(&sub->subsystem_list);
+	INIT_LIST_HEAD(&sub->subsystem_list);
 	INIT_LIST_HEAD(&sub->test_list);
-        spin_lock_init(&sub->test_lock);
-        sub->desc.id = SPLAT_SUBSYSTEM_KMEM;
-
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST1_NAME, SPLAT_KMEM_TEST1_DESC,
-	              SPLAT_KMEM_TEST1_ID, splat_kmem_test1);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST2_NAME, SPLAT_KMEM_TEST2_DESC,
-	              SPLAT_KMEM_TEST2_ID, splat_kmem_test2);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST3_NAME, SPLAT_KMEM_TEST3_DESC,
-	              SPLAT_KMEM_TEST3_ID, splat_kmem_test3);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST4_NAME, SPLAT_KMEM_TEST4_DESC,
-	              SPLAT_KMEM_TEST4_ID, splat_kmem_test4);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC,
-	              SPLAT_KMEM_TEST5_ID, splat_kmem_test5);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC,
-	              SPLAT_KMEM_TEST6_ID, splat_kmem_test6);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC,
-	              SPLAT_KMEM_TEST7_ID, splat_kmem_test7);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST8_NAME, SPLAT_KMEM_TEST8_DESC,
-	              SPLAT_KMEM_TEST8_ID, splat_kmem_test8);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST9_NAME, SPLAT_KMEM_TEST9_DESC,
-	              SPLAT_KMEM_TEST9_ID, splat_kmem_test9);
-
-        return sub;
+	spin_lock_init(&sub->test_lock);
+	sub->desc.id = SPLAT_SUBSYSTEM_KMEM;
+
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST1_NAME, SPLAT_KMEM_TEST1_DESC,
+			SPLAT_KMEM_TEST1_ID, splat_kmem_test1);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST2_NAME, SPLAT_KMEM_TEST2_DESC,
+			SPLAT_KMEM_TEST2_ID, splat_kmem_test2);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST3_NAME, SPLAT_KMEM_TEST3_DESC,
+			SPLAT_KMEM_TEST3_ID, splat_kmem_test3);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST4_NAME, SPLAT_KMEM_TEST4_DESC,
+			SPLAT_KMEM_TEST4_ID, splat_kmem_test4);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC,
+			SPLAT_KMEM_TEST5_ID, splat_kmem_test5);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC,
+			SPLAT_KMEM_TEST6_ID, splat_kmem_test6);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC,
+			SPLAT_KMEM_TEST7_ID, splat_kmem_test7);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST8_NAME, SPLAT_KMEM_TEST8_DESC,
+			SPLAT_KMEM_TEST8_ID, splat_kmem_test8);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST9_NAME, SPLAT_KMEM_TEST9_DESC,
+			SPLAT_KMEM_TEST9_ID, splat_kmem_test9);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST10_NAME, SPLAT_KMEM_TEST10_DESC,
+			SPLAT_KMEM_TEST10_ID, splat_kmem_test10);
+	SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST11_NAME, SPLAT_KMEM_TEST11_DESC,
+			SPLAT_KMEM_TEST11_ID, splat_kmem_test11);
+
+	return sub;
 }
 
 void
 splat_kmem_fini(splat_subsystem_t *sub)
 {
-        ASSERT(sub);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST9_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST8_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST2_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST1_ID);
-
-        kfree(sub);
+	ASSERT(sub);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST11_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST10_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST9_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST8_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST2_ID);
+	SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST1_ID);
+
+	kfree(sub);
 }
 
 int
 splat_kmem_id(void) {
-        return SPLAT_SUBSYSTEM_KMEM;
+	return SPLAT_SUBSYSTEM_KMEM;
 }
author	Brian Behlendorf <[email protected]>	2009-01-30 20:54:49 -0800
committer	Brian Behlendorf <[email protected]>	2009-01-30 20:54:49 -0800
commit	ea3e6ca9e595ebfba82b964ee2eaf1ddd7076f0f (patch)
tree	7480b87145297f3882ffe18234280512e136cdb4 /module
parent	34e71c9e97f4d0d2b3ede850d016a7de558b0f3c (diff)