aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2012-11-08 11:00:23 -0800
committerBrian Behlendorf <[email protected]>2012-11-08 11:09:17 -0800
commit366346c56592b1ff13020460dcbdd607c70ac7f1 (patch)
tree02a97ae4725493bdd4836389b1a732bd8403e3f8
parent65c2fc5a2ed3a60711cc63e53b3ab01e9d5095ae (diff)
parentdc1b30224f9b1587dbe383d9c8e16caa4b1f71d3 (diff)
Merge branch 'kmem-cache-optimization'
This branch contains kmem cache optimizations designed to resolve the lockups reported in zfsonlinux/zfs#922. The lockups were largely the result of spin lock contention in the slab under low memory conditions. Fundamentally, these changes are all designed to minimize that contention though a variety of methods. * Improved vmem cached deadlock detection * Track emergency objects in rbtree * Optimize spl_kmem_cache_free() * Never spin in kmem_cache_alloc() Signed-off-by: Brian Behlendorf <[email protected]> zfsonlinux/zfs#922
-rw-r--r--include/sys/kmem.h8
-rw-r--r--module/spl/spl-kmem.c164
-rw-r--r--module/spl/spl-proc.c9
3 files changed, 131 insertions, 50 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index e71a443a0..83adc8d2a 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -31,6 +31,7 @@
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/hash.h>
+#include <linux/rbtree.h>
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
@@ -340,6 +341,7 @@ enum {
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */
+ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
@@ -366,6 +368,7 @@ typedef enum kmem_cbrc {
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
+#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
@@ -433,8 +436,8 @@ typedef struct spl_kmem_alloc {
} spl_kmem_alloc_t;
typedef struct spl_kmem_emergency {
+ struct rb_node ske_node; /* Emergency tree linkage */
void *ske_obj; /* Buffer address */
- struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;
typedef struct spl_kmem_cache {
@@ -461,7 +464,7 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
- struct list_head skc_emergency_list; /* Min sized objects */
+ struct rb_root skc_emergency_tree; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
@@ -473,6 +476,7 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
+ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index eca809c47..f3113e0f4 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT;
}
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < (unsigned long)ske->ske_obj)
+ node = node->rb_left;
+ else if (address > (unsigned long)ske->ske_obj)
+ node = node->rb_right;
+ else
+ return ske;
+ }
+
+ return NULL;
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = (unsigned long)ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < (unsigned long)ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > (unsigned long)ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return 0;
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return 1;
+}
+
/*
- * Allocate a single emergency object for use by the caller.
+ * Allocate a single emergency object and track it in a red black tree.
*/
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
@@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
SRETURN(-ENOMEM);
}
- if (skc->skc_ctor)
- skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
-
spin_lock(&skc->skc_lock);
- skc->skc_obj_total++;
- skc->skc_obj_emergency++;
- if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
- skc->skc_obj_emergency_max = skc->skc_obj_emergency;
-
- list_add(&ske->ske_list, &skc->skc_emergency_list);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
spin_unlock(&skc->skc_lock);
+ if (unlikely(!empty)) {
+ kfree(ske->ske_obj);
+ kfree(ske);
+ SRETURN(-EINVAL);
+ }
+
+ if (skc->skc_ctor)
+ skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
*obj = ske->ske_obj;
SRETURN(0);
}
/*
- * Free the passed object if it is an emergency object or a normal slab
- * object. Currently this is done by walking what should be a short list of
- * emergency objects. If this proves to be too inefficient we can replace
- * the simple list with a hash.
+ * Locate the passed object in the red black tree and free it.
*/
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
- spl_kmem_emergency_t *m, *n, *ske = NULL;
+ spl_kmem_emergency_t *ske;
SENTRY;
spin_lock(&skc->skc_lock);
- list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
- if (m->ske_obj == obj) {
- list_del(&m->ske_list);
- skc->skc_obj_emergency--;
- skc->skc_obj_total--;
- ske = m;
- break;
- }
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (likely(ske)) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
}
spin_unlock(&skc->skc_lock);
- if (ske == NULL)
+ if (unlikely(ske == NULL))
SRETURN(-ENOENT);
if (skc->skc_dtor)
@@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
- INIT_LIST_HEAD(&skc->skc_emergency_list);
+ skc->skc_emergency_tree = RB_ROOT;
spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
@@ -1495,6 +1542,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;
@@ -1589,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
- ASSERT(list_empty(&skc->skc_emergency_list));
kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
@@ -1662,6 +1709,7 @@ spl_cache_grow_work(void *data)
atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);
@@ -1677,13 +1725,20 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}
+static int
+spl_cache_reclaim_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
- int remaining, rc = 0;
+ int remaining, rc;
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1691,12 +1746,14 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
*obj = NULL;
/*
- * Before allocating a new slab check if the slab is being reaped.
- * If it is there is a good chance we can wait until it finishes
- * and then use one of the newly freed but not aged-out slabs.
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
*/
- if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
- SRETURN(-EAGAIN);
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
+ SRETURN(rc ? rc : -EAGAIN);
+ }
/*
* This is handled by dispatching a work request to the global work
@@ -1722,17 +1779,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
}
/*
- * Allow a single timer tick before falling back to synchronously
- * allocating the minimum about of memory required by the caller.
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
*/
- remaining = wait_event_timeout(skc->skc_waitq,
- spl_cache_grow_wait(skc), 1);
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ);
- if (remaining == 0) {
- if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
- rc = -ENOMEM;
- else
- rc = spl_emergency_alloc(skc, flags, obj);
+ if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
}
SRETURN(rc);
@@ -1962,11 +2032,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
atomic_inc(&skc->skc_ref);
/*
- * Emergency objects are never part of the virtual address space
- * so if we get a virtual address we can optimize this check out.
+ * Only virtual slabs may have emergency objects and these objects
+ * are guaranteed to have physical addresses. They must be removed
+ * from the tree of emergency objects and the freed.
*/
- if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
- SGOTO(out, 0);
+ if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
+ SGOTO(out, spl_emergency_free(skc, obj));
local_irq_save(flags);
@@ -2094,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
/* Reclaim from the cache, ignoring it's age and delay. */
spl_slab_reclaim(skc, count, 1);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+
atomic_dec(&skc->skc_ref);
SEXIT;
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
index 11a2d1068..152abff7f 100644
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
"--------------------- cache ----------"
"--------------------------------------------- "
"----- slab ------ "
- "---- object -----------------\n");
+ "---- object ----- "
+ "--- emergency ---\n");
seq_printf(f,
"name "
" flags size alloc slabsize objsize "
"total alloc max "
- "total alloc max emerg max\n");
+ "total alloc max "
+ "dlock alloc max\n");
}
static int
@@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
- "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
(long unsigned)skc->skc_flags,
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
(long unsigned)skc->skc_obj_total,
(long unsigned)skc->skc_obj_alloc,
(long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
(long unsigned)skc->skc_obj_emergency,
(long unsigned)skc->skc_obj_emergency_max);