author    | Brian Behlendorf <[email protected]> | 2012-11-08 11:00:23 -0800
committer | Brian Behlendorf <[email protected]> | 2012-11-08 11:09:17 -0800
commit    | 366346c56592b1ff13020460dcbdd607c70ac7f1
tree      | 02a97ae4725493bdd4836389b1a732bd8403e3f8
parent    | 65c2fc5a2ed3a60711cc63e53b3ab01e9d5095ae
parent    | dc1b30224f9b1587dbe383d9c8e16caa4b1f71d3
Merge branch 'kmem-cache-optimization'
This branch contains kmem cache optimizations designed to resolve
the lockups reported in zfsonlinux/zfs#922. The lockups were
largely the result of spin lock contention in the slab under low
memory conditions. Fundamentally, these changes are all designed
to minimize that contention through a variety of methods.
* Improved vmem cached deadlock detection
* Track emergency objects in rbtree
* Optimize spl_kmem_cache_free()
* Never spin in kmem_cache_alloc()
Signed-off-by: Brian Behlendorf <[email protected]>
zfsonlinux/zfs#922
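
The emergency-object change above ("Track emergency objects in rbtree") replaces a linear walk of skc_emergency_list with an address-keyed red-black tree, so spl_kmem_cache_free() can locate an emergency object in O(log n) while holding skc_lock. The fragment below is a minimal sketch of that pattern, condensed from the spl_emergency_search()/spl_emergency_free() code added in the diff; the names tracked_obj, tracker_find, and tracker_remove are illustrative only and are not part of the SPL API.

#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/errno.h>

/* Illustrative only: one tracked emergency object, keyed by its address. */
struct tracked_obj {
        struct rb_node  node;   /* rbtree linkage */
        void            *obj;   /* buffer address, doubles as the key */
};

/* Walk the tree comparing addresses until the matching node is found. */
static struct tracked_obj *
tracker_find(struct rb_root *root, void *obj)
{
        struct rb_node *n = root->rb_node;
        unsigned long address = (unsigned long)obj;

        while (n) {
                struct tracked_obj *t =
                    container_of(n, struct tracked_obj, node);

                if (address < (unsigned long)t->obj)
                        n = n->rb_left;
                else if (address > (unsigned long)t->obj)
                        n = n->rb_right;
                else
                        return t;
        }

        return NULL;
}

/* Free path: O(log n) lookup plus rb_erase() under the lock, then kfree. */
static int
tracker_remove(struct rb_root *root, spinlock_t *lock, void *obj)
{
        struct tracked_obj *t;

        spin_lock(lock);
        t = tracker_find(root, obj);
        if (t)
                rb_erase(&t->node, root);
        spin_unlock(lock);

        if (t == NULL)
                return -ENOENT;

        kfree(t);       /* drop the tracking node; the buffer itself is freed by the caller */
        return 0;
}

The insert side (spl_emergency_insert() in the diff) follows the standard rb_link_node()/rb_insert_color() idiom under the same lock.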
-rw-r--r-- | include/sys/kmem.h    |   8
-rw-r--r-- | module/spl/spl-kmem.c | 164
-rw-r--r-- | module/spl/spl-proc.c |   9
3 files changed, 131 insertions, 50 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index e71a443a0..83adc8d2a 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -31,6 +31,7 @@
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
 #include <linux/hash.h>
+#include <linux/rbtree.h>
 #include <linux/ctype.h>
 #include <asm/atomic.h>
 #include <sys/types.h>
@@ -340,6 +341,7 @@ enum {
         KMC_BIT_VMEM        = 6,    /* Use vmem cache */
         KMC_BIT_OFFSLAB     = 7,    /* Objects not on slab */
         KMC_BIT_NOEMERGENCY = 8,    /* Disable emergency objects */
+        KMC_BIT_DEADLOCKED  = 14,   /* Deadlock detected */
         KMC_BIT_GROWING     = 15,   /* Growing in progress */
         KMC_BIT_REAPING     = 16,   /* Reaping in progress */
         KMC_BIT_DESTROY     = 17,   /* Destroy in progress */
@@ -366,6 +368,7 @@ typedef enum kmem_cbrc {
 #define KMC_VMEM        (1 << KMC_BIT_VMEM)
 #define KMC_OFFSLAB     (1 << KMC_BIT_OFFSLAB)
 #define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
+#define KMC_DEADLOCKED  (1 << KMC_BIT_DEADLOCKED)
 #define KMC_GROWING     (1 << KMC_BIT_GROWING)
 #define KMC_REAPING     (1 << KMC_BIT_REAPING)
 #define KMC_DESTROY     (1 << KMC_BIT_DESTROY)
@@ -433,8 +436,8 @@ typedef struct spl_kmem_alloc {
 } spl_kmem_alloc_t;
 
 typedef struct spl_kmem_emergency {
+        struct rb_node          ske_node;       /* Emergency tree linkage */
         void                    *ske_obj;       /* Buffer address */
-        struct list_head        ske_list;       /* Emergency list linkage */
 } spl_kmem_emergency_t;
 
 typedef struct spl_kmem_cache {
@@ -461,7 +464,7 @@ typedef struct spl_kmem_cache {
         struct list_head        skc_list;           /* List of caches linkage */
         struct list_head        skc_complete_list;  /* Completely alloc'ed */
         struct list_head        skc_partial_list;   /* Partially alloc'ed */
-        struct list_head        skc_emergency_list; /* Min sized objects */
+        struct rb_root          skc_emergency_tree; /* Min sized objects */
         spinlock_t              skc_lock;           /* Cache lock */
         wait_queue_head_t       skc_waitq;          /* Allocation waiters */
         uint64_t                skc_slab_fail;      /* Slab alloc failures */
@@ -473,6 +476,7 @@ typedef struct spl_kmem_cache {
         uint64_t                skc_obj_total;      /* Obj total current */
         uint64_t                skc_obj_alloc;      /* Obj alloc current */
         uint64_t                skc_obj_max;        /* Obj max historic */
+        uint64_t                skc_obj_deadlock;   /* Obj emergency deadlocks */
         uint64_t                skc_obj_emergency;  /* Obj emergency current */
         uint64_t                skc_obj_emergency_max; /* Obj emergency max */
 } spl_kmem_cache_t;
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index eca809c47..f3113e0f4 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
         SEXIT;
 }
 
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+        struct rb_node *node = root->rb_node;
+        spl_kmem_emergency_t *ske;
+        unsigned long address = (unsigned long)obj;
+
+        while (node) {
+                ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+                if (address < (unsigned long)ske->ske_obj)
+                        node = node->rb_left;
+                else if (address > (unsigned long)ske->ske_obj)
+                        node = node->rb_right;
+                else
+                        return ske;
+        }
+
+        return NULL;
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+        struct rb_node **new = &(root->rb_node), *parent = NULL;
+        spl_kmem_emergency_t *ske_tmp;
+        unsigned long address = (unsigned long)ske->ske_obj;
+
+        while (*new) {
+                ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+                parent = *new;
+                if (address < (unsigned long)ske_tmp->ske_obj)
+                        new = &((*new)->rb_left);
+                else if (address > (unsigned long)ske_tmp->ske_obj)
+                        new = &((*new)->rb_right);
+                else
+                        return 0;
+        }
+
+        rb_link_node(&ske->ske_node, parent, new);
+        rb_insert_color(&ske->ske_node, root);
+
+        return 1;
+}
+
 /*
- * Allocate a single emergency object for use by the caller.
+ * Allocate a single emergency object and track it in a red black tree.
  */
 static int
 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
@@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
                 SRETURN(-ENOMEM);
         }
 
-        if (skc->skc_ctor)
-                skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
-
         spin_lock(&skc->skc_lock);
-        skc->skc_obj_total++;
-        skc->skc_obj_emergency++;
-        if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
-                skc->skc_obj_emergency_max = skc->skc_obj_emergency;
-
-        list_add(&ske->ske_list, &skc->skc_emergency_list);
+        empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+        if (likely(empty)) {
+                skc->skc_obj_total++;
+                skc->skc_obj_emergency++;
+                if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+                        skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+        }
         spin_unlock(&skc->skc_lock);
 
+        if (unlikely(!empty)) {
+                kfree(ske->ske_obj);
+                kfree(ske);
+                SRETURN(-EINVAL);
+        }
+
+        if (skc->skc_ctor)
+                skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
         *obj = ske->ske_obj;
 
         SRETURN(0);
 }
 
 /*
- * Free the passed object if it is an emergency object or a normal slab
- * object.  Currently this is done by walking what should be a short list of
- * emergency objects.  If this proves to be too inefficient we can replace
- * the simple list with a hash.
+ * Locate the passed object in the red black tree and free it.
  */
 static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
-        spl_kmem_emergency_t *m, *n, *ske = NULL;
+        spl_kmem_emergency_t *ske;
         SENTRY;
 
         spin_lock(&skc->skc_lock);
-        list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
-                if (m->ske_obj == obj) {
-                        list_del(&m->ske_list);
-                        skc->skc_obj_emergency--;
-                        skc->skc_obj_total--;
-                        ske = m;
-                        break;
-                }
+        ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+        if (likely(ske)) {
+                rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+                skc->skc_obj_emergency--;
+                skc->skc_obj_total--;
         }
         spin_unlock(&skc->skc_lock);
 
-        if (ske == NULL)
+        if (unlikely(ske == NULL))
                 SRETURN(-ENOENT);
 
         if (skc->skc_dtor)
@@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         INIT_LIST_HEAD(&skc->skc_list);
         INIT_LIST_HEAD(&skc->skc_complete_list);
         INIT_LIST_HEAD(&skc->skc_partial_list);
-        INIT_LIST_HEAD(&skc->skc_emergency_list);
+        skc->skc_emergency_tree = RB_ROOT;
         spin_lock_init(&skc->skc_lock);
         init_waitqueue_head(&skc->skc_waitq);
         skc->skc_slab_fail = 0;
@@ -1495,6 +1542,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         skc->skc_obj_total = 0;
         skc->skc_obj_alloc = 0;
         skc->skc_obj_max = 0;
+        skc->skc_obj_deadlock = 0;
         skc->skc_obj_emergency = 0;
         skc->skc_obj_emergency_max = 0;
 
@@ -1589,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
         ASSERT3U(skc->skc_obj_total, ==, 0);
         ASSERT3U(skc->skc_obj_emergency, ==, 0);
         ASSERT(list_empty(&skc->skc_complete_list));
-        ASSERT(list_empty(&skc->skc_emergency_list));
         kmem_free(skc->skc_name, skc->skc_name_size);
         spin_unlock(&skc->skc_lock);
 
@@ -1662,6 +1709,7 @@ spl_cache_grow_work(void *data)
 
         atomic_dec(&skc->skc_ref);
         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+        clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
         wake_up_all(&skc->skc_waitq);
         spin_unlock(&skc->skc_lock);
 
@@ -1677,13 +1725,20 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
         return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
 }
 
+static int
+spl_cache_reclaim_wait(void *word)
+{
+        schedule();
+        return 0;
+}
+
 /*
  * No available objects on any slabs, create a new slab.
  */
 static int
 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 {
-        int remaining, rc = 0;
+        int remaining, rc;
         SENTRY;
 
         ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1691,12 +1746,14 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
         *obj = NULL;
 
         /*
-         * Before allocating a new slab check if the slab is being reaped.
-         * If it is there is a good chance we can wait until it finishes
-         * and then use one of the newly freed but not aged-out slabs.
+         * Before allocating a new slab wait for any reaping to complete and
+         * then return so the local magazine can be rechecked for new objects.
          */
-        if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
-                SRETURN(-EAGAIN);
+        if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+                rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+                    spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
+                SRETURN(rc ? rc : -EAGAIN);
+        }
 
         /*
          * This is handled by dispatching a work request to the global work
@@ -1722,17 +1779,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
         }
 
         /*
-         * Allow a single timer tick before falling back to synchronously
-         * allocating the minimum about of memory required by the caller.
+         * The goal here is to only detect the rare case where a virtual slab
+         * allocation has deadlocked.  We must be careful to minimize the use
+         * of emergency objects which are more expensive to track.  Therefore,
+         * we set a very long timeout for the asynchronous allocation and if
+         * the timeout is reached the cache is flagged as deadlocked.  From
+         * this point only new emergency objects will be allocated until the
+         * asynchronous allocation completes and clears the deadlocked flag.
          */
-        remaining = wait_event_timeout(skc->skc_waitq,
-            spl_cache_grow_wait(skc), 1);
+        if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+                rc = spl_emergency_alloc(skc, flags, obj);
+        } else {
+                remaining = wait_event_timeout(skc->skc_waitq,
+                    spl_cache_grow_wait(skc), HZ);
 
-        if (remaining == 0) {
-                if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
-                        rc = -ENOMEM;
-                else
-                        rc = spl_emergency_alloc(skc, flags, obj);
+                if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+                        spin_lock(&skc->skc_lock);
+                        if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+                                set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+                                skc->skc_obj_deadlock++;
+                        }
+                        spin_unlock(&skc->skc_lock);
+                }
+
+                rc = -ENOMEM;
         }
 
         SRETURN(rc);
@@ -1962,11 +2032,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         atomic_inc(&skc->skc_ref);
 
         /*
-         * Emergency objects are never part of the virtual address space
-         * so if we get a virtual address we can optimize this check out.
+         * Only virtual slabs may have emergency objects and these objects
+         * are guaranteed to have physical addresses.  They must be removed
+         * from the tree of emergency objects and the freed.
          */
-        if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
-                SGOTO(out, 0);
+        if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
+                SGOTO(out, spl_emergency_free(skc, obj));
 
         local_irq_save(flags);
 
@@ -2094,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
         /* Reclaim from the cache, ignoring it's age and delay. */
         spl_slab_reclaim(skc, count, 1);
         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+
         atomic_dec(&skc->skc_ref);
 
         SEXIT;
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
index 11a2d1068..152abff7f 100644
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
             "--------------------- cache ----------"
             "---------------------------------------------  "
             "----- slab ------  "
-            "---- object -----------------\n");
+            "---- object -----  "
+            "--- emergency ---\n");
         seq_printf(f,
             "name                                  "
             "  flags      size     alloc slabsize  objsize  "
             "total alloc   max  "
-            "total alloc   max emerg   max\n");
+            "total alloc   max  "
+            "dlock alloc   max\n");
 }
 
 static int
@@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
         spin_lock(&skc->skc_lock);
         seq_printf(f, "%-36s  ", skc->skc_name);
         seq_printf(f, "0x%05lx %9lu %9lu %8u %8u  "
-            "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+            "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
             (long unsigned)skc->skc_flags,
             (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
             (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
             (long unsigned)skc->skc_obj_total,
             (long unsigned)skc->skc_obj_alloc,
             (long unsigned)skc->skc_obj_max,
+            (long unsigned)skc->skc_obj_deadlock,
             (long unsigned)skc->skc_obj_emergency,
             (long unsigned)skc->skc_obj_emergency_max);
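
One other detail worth noting from the spl-kmem.c hunks: spl_cache_grow() no longer returns -EAGAIN immediately while a reap is in flight; it sleeps on KMC_BIT_REAPING with wait_on_bit(), and spl_kmem_cache_reap_now() pairs clear_bit() with smp_mb__after_clear_bit() and wake_up_bit() so sleepers are reliably woken. A minimal sketch of that waiter/waker pairing is below, using the same era-appropriate primitives the patch uses (the wait_on_bit() form that takes an action callback); the names demo_flags, DEMO_BIT_BUSY, demo_wait_action, demo_begin, and demo_end are hypothetical.

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

static unsigned long demo_flags;        /* hypothetical flag word */
#define DEMO_BIT_BUSY   0               /* set while the operation runs */

/* Action callback required by the 2012-era wait_on_bit(): just sleep. */
static int
demo_wait_action(void *word)
{
        schedule();
        return 0;
}

/* Owner side: mark the operation in progress. */
static void
demo_begin(void)
{
        set_bit(DEMO_BIT_BUSY, &demo_flags);
}

/* Owner side: clear the bit, order the store, then wake any sleepers. */
static void
demo_end(void)
{
        clear_bit(DEMO_BIT_BUSY, &demo_flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&demo_flags, DEMO_BIT_BUSY);
}

/* Waiter side: block until the bit clears instead of spinning or bailing out. */
static int
demo_wait_until_idle(void)
{
        if (test_bit(DEMO_BIT_BUSY, &demo_flags))
                return wait_on_bit(&demo_flags, DEMO_BIT_BUSY,
                    demo_wait_action, TASK_UNINTERRUPTIBLE);

        return 0;
}

In the patch itself the flag word is skc->skc_flags and the bit is KMC_BIT_REAPING; the KMC_BIT_DEADLOCKED flag is instead cleared by the asynchronous grow worker, with waiters woken through the existing skc_waitq.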