-rw-r--r-- | include/sys/kmem.h    |  17 |
-rw-r--r-- | module/spl/spl-kmem.c | 233 |
-rw-r--r-- | module/spl/spl-proc.c |  10 |
3 files changed, 216 insertions(+), 44 deletions(-)
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 344e2716b..aaff6d046 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -291,6 +291,7 @@ enum {
         KMC_BIT_KMEM        = 5,        /* Use kmem cache */
         KMC_BIT_VMEM        = 6,        /* Use vmem cache */
         KMC_BIT_OFFSLAB     = 7,        /* Objects not on slab */
+        KMC_BIT_GROWING     = 15,       /* Growing in progress */
         KMC_BIT_REAPING     = 16,       /* Reaping in progress */
         KMC_BIT_DESTROY     = 17,       /* Destroy in progress */
         KMC_BIT_TOTAL       = 18,       /* Proc handler helper bit */
@@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
 #define KMC_KMEM            (1 << KMC_BIT_KMEM)
 #define KMC_VMEM            (1 << KMC_BIT_VMEM)
 #define KMC_OFFSLAB         (1 << KMC_BIT_OFFSLAB)
+#define KMC_GROWING         (1 << KMC_BIT_GROWING)
 #define KMC_REAPING         (1 << KMC_BIT_REAPING)
 #define KMC_DESTROY         (1 << KMC_BIT_DESTROY)
 #define KMC_TOTAL           (1 << KMC_BIT_TOTAL)
@@ -374,6 +376,17 @@ typedef struct spl_kmem_slab {
         uint32_t            sks_ref;    /* Ref count used objects */
 } spl_kmem_slab_t;
 
+typedef struct spl_kmem_alloc {
+        struct spl_kmem_cache   *ska_cache;     /* Owned by cache */
+        int                     ska_flags;      /* Allocation flags */
+        struct delayed_work     ska_work;       /* Allocation work */
+} spl_kmem_alloc_t;
+
+typedef struct spl_kmem_emergency {
+        void                    *ske_obj;       /* Buffer address */
+        struct list_head        ske_list;       /* Emergency list linkage */
+} spl_kmem_emergency_t;
+
 typedef struct spl_kmem_cache {
         uint32_t            skc_magic;          /* Sanity magic */
         uint32_t            skc_name_size;      /* Name length */
@@ -398,7 +411,9 @@ typedef struct spl_kmem_cache {
         struct list_head    skc_list;           /* List of caches linkage */
         struct list_head    skc_complete_list;  /* Completely alloc'ed */
         struct list_head    skc_partial_list;   /* Partially alloc'ed */
+        struct list_head    skc_emergency_list; /* Min sized objects */
         spinlock_t          skc_lock;           /* Cache lock */
+        wait_queue_head_t   skc_waitq;          /* Allocation waiters */
         uint64_t            skc_slab_fail;      /* Slab alloc failures */
         uint64_t            skc_slab_create;    /* Slab creates */
         uint64_t            skc_slab_destroy;   /* Slab destroys */
@@ -408,6 +423,8 @@ typedef struct spl_kmem_cache {
         uint64_t            skc_obj_total;      /* Obj total current */
         uint64_t            skc_obj_alloc;      /* Obj alloc current */
         uint64_t            skc_obj_max;        /* Obj max historic */
+        uint64_t            skc_obj_emergency;  /* Obj emergency current */
+        uint64_t            skc_obj_emergency_max; /* Obj emergency max */
 } spl_kmem_cache_t;
 
 #define kmem_cache_t        spl_kmem_cache_t
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 258d61478..4cf3b26ad 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1144,6 +1144,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 }
 
 /*
+ * Allocate a single emergency object for use by the caller.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+        spl_kmem_emergency_t *ske;
+        int empty;
+        SENTRY;
+
+        /* Last chance: use a partial slab if one now exists */
+        spin_lock(&skc->skc_lock);
+        empty = list_empty(&skc->skc_partial_list);
+        spin_unlock(&skc->skc_lock);
+        if (!empty)
+                SRETURN(-EEXIST);
+
+        ske = kmalloc(sizeof(*ske), flags);
+        if (ske == NULL)
+                SRETURN(-ENOMEM);
+
+        ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
+        if (ske->ske_obj == NULL) {
+                kfree(ske);
+                SRETURN(-ENOMEM);
+        }
+
+        if (skc->skc_ctor)
+                skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
+        spin_lock(&skc->skc_lock);
+        skc->skc_obj_total++;
+        skc->skc_obj_emergency++;
+        if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+                skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+
+        list_add(&ske->ske_list, &skc->skc_emergency_list);
+        spin_unlock(&skc->skc_lock);
+
+        *obj = ske->ske_obj;
+
+        SRETURN(0);
+}
+
+/*
+ * Free the passed object if it is an emergency object or a normal slab
+ * object.  Currently this is done by walking what should be a short list of
+ * emergency objects.  If this proves to be too inefficient we can replace
+ * the simple list with a hash.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+        spl_kmem_emergency_t *m, *n, *ske = NULL;
+        SENTRY;
+
+        spin_lock(&skc->skc_lock);
+        list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
+                if (m->ske_obj == obj) {
+                        list_del(&m->ske_list);
+                        skc->skc_obj_emergency--;
+                        skc->skc_obj_total--;
+                        ske = m;
+                        break;
+                }
+        }
+        spin_unlock(&skc->skc_lock);
+
+        if (ske == NULL)
+                SRETURN(-ENOENT);
+
+        if (skc->skc_dtor)
+                skc->skc_dtor(ske->ske_obj, skc->skc_private);
+
+        kfree(ske->ske_obj);
+        kfree(ske);
+
+        SRETURN(0);
+}
+
+/*
  * Called regularly on all caches to age objects out of the magazines
  * which have not been accessed in skc->skc_delay seconds.  This prevents
  * idle magazines from holding memory which might be better used by
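The two helpers above do all of their bookkeeping on a plain linked list guarded by skc_lock: every emergency buffer is wrapped in a small tracking node on skc_emergency_list, and freeing walks that list to find the node that owns the address. A minimal user-space sketch of the same idea (hypothetical names, plain C with malloc in place of the SPL macros and kmalloc):

/*
 * Hypothetical model of the emergency-list bookkeeping: each buffer handed
 * out is wrapped in a tracking node, and freeing searches for the node that
 * owns the address.  The single global list is illustrative only.
 */
#include <stdlib.h>

struct emergency {
        void *obj;               /* buffer handed to the caller */
        struct emergency *next;  /* list linkage */
};

static struct emergency *emergency_list;
static unsigned long emergency_count, emergency_max;

static void *emergency_alloc(size_t size)
{
        struct emergency *ske = malloc(sizeof(*ske));

        if (ske == NULL)
                return NULL;

        ske->obj = malloc(size);
        if (ske->obj == NULL) {
                free(ske);
                return NULL;
        }

        /* Track the object and update the current/high-water counters. */
        ske->next = emergency_list;
        emergency_list = ske;
        if (++emergency_count > emergency_max)
                emergency_max = emergency_count;

        return ske->obj;
}

static int emergency_free(void *obj)
{
        struct emergency **pp, *ske;

        /* Linear search, like the short skc_emergency_list walk above. */
        for (pp = &emergency_list; (ske = *pp) != NULL; pp = &ske->next) {
                if (ske->obj == obj) {
                        *pp = ske->next;
                        emergency_count--;
                        free(ske->obj);
                        free(ske);
                        return 0;
                }
        }

        return -1;               /* not an emergency object */
}

As the patch comment itself notes, the linear walk is only acceptable while the emergency list stays short; a hash keyed by address would be the natural replacement if it ever grows.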
@@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         INIT_LIST_HEAD(&skc->skc_list);
         INIT_LIST_HEAD(&skc->skc_complete_list);
         INIT_LIST_HEAD(&skc->skc_partial_list);
+        INIT_LIST_HEAD(&skc->skc_emergency_list);
         spin_lock_init(&skc->skc_lock);
+        init_waitqueue_head(&skc->skc_waitq);
         skc->skc_slab_fail = 0;
         skc->skc_slab_create = 0;
         skc->skc_slab_destroy = 0;
@@ -1440,6 +1522,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         skc->skc_obj_total = 0;
         skc->skc_obj_alloc = 0;
         skc->skc_obj_max = 0;
+        skc->skc_obj_emergency = 0;
+        skc->skc_obj_emergency_max = 0;
 
         if (align) {
                 VERIFY(ISP2(align));
@@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
         ASSERT3U(skc->skc_obj_alloc, ==, 0);
         ASSERT3U(skc->skc_slab_total, ==, 0);
         ASSERT3U(skc->skc_obj_total, ==, 0);
+        ASSERT3U(skc->skc_obj_emergency, ==, 0);
         ASSERT(list_empty(&skc->skc_complete_list));
+        ASSERT(list_empty(&skc->skc_emergency_list));
         kmem_free(skc->skc_name, skc->skc_name_size);
         spin_unlock(&skc->skc_lock);
 
@@ -1581,59 +1667,112 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 }
 
 /*
- * No available objects on any slabs, create a new slab.  Since this
- * is an expensive operation we do it without holding the spin lock and
- * only briefly acquire it when we link in the fully allocated and
- * constructed slab.
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
  */
-static spl_kmem_slab_t *
-spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+static void
+spl_cache_grow_work(void *data)
 {
+        spl_kmem_alloc_t *ska =
+            spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
+        spl_kmem_cache_t *skc = ska->ska_cache;
         spl_kmem_slab_t *sks;
+
+        sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
+        spin_lock(&skc->skc_lock);
+        if (sks) {
+                skc->skc_slab_total++;
+                skc->skc_obj_total += sks->sks_objs;
+                list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+        }
+
+        atomic_dec(&skc->skc_ref);
+        clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+        wake_up_all(&skc->skc_waitq);
+        spin_unlock(&skc->skc_lock);
+
+        kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+        return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
+}
+
+/*
+ * No available objects on any slabs, create a new slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+        int remaining, rc = 0;
         SENTRY;
 
         ASSERT(skc->skc_magic == SKC_MAGIC);
-        local_irq_enable();
         might_sleep();
+        *obj = NULL;
 
         /*
          * Before allocating a new slab check if the slab is being reaped.
          * If it is there is a good chance we can wait until it finishes
          * and then use one of the newly freed but not aged-out slabs.
          */
-        if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
-                schedule();
-                SGOTO(out, sks = NULL);
-        }
+        if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
+                SRETURN(-EAGAIN);
 
-        /* Allocate a new slab for the cache */
-        sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
-        if (sks == NULL)
-                SGOTO(out, sks = NULL);
+        /*
+         * This is handled by dispatching a work request to the global work
+         * queue.  This allows us to asynchronously allocate a new slab while
+         * retaining the ability to safely fall back to smaller synchronous
+         * allocations to ensure forward progress is always maintained.
+         */
+        if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+                spl_kmem_alloc_t *ska;
 
-        /* Link the new empty slab in to the end of skc_partial_list. */
-        spin_lock(&skc->skc_lock);
-        skc->skc_slab_total++;
-        skc->skc_obj_total += sks->sks_objs;
-        list_add_tail(&sks->sks_list, &skc->skc_partial_list);
-        spin_unlock(&skc->skc_lock);
-out:
-        local_irq_disable();
+                ska = kmalloc(sizeof(*ska), flags);
+                if (ska == NULL) {
+                        clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+                        wake_up_all(&skc->skc_waitq);
+                        SRETURN(-ENOMEM);
+                }
 
-        SRETURN(sks);
+                atomic_inc(&skc->skc_ref);
+                ska->ska_cache = skc;
+                ska->ska_flags = flags;
+                spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
+                schedule_delayed_work(&ska->ska_work, 0);
+        }
+
+        /*
+         * Allow a single timer tick before falling back to synchronously
+         * allocating the minimum amount of memory required by the caller.
+         */
+        remaining = wait_event_timeout(skc->skc_waitq,
+            spl_cache_grow_wait(skc), 1);
+        if (remaining == 0)
+                rc = spl_emergency_alloc(skc, flags, obj);
+
+        SRETURN(rc);
 }
 
 /*
- * Refill a per-cpu magazine with objects from the slabs for this
- * cache.  Ideally the magazine can be repopulated using existing
- * objects which have been released, however if we are unable to
- * locate enough free objects new slabs of objects will be created.
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created.  On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
  */
-static int
+static void *
 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 {
         spl_kmem_slab_t *sks;
-        int rc = 0, refill;
+        int count = 0, rc, refill;
+        void *obj = NULL;
         SENTRY;
 
         ASSERT(skc->skc_magic == SKC_MAGIC);
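The rework above splits slab growth into two paths: the first caller to win KMC_BIT_GROWING dispatches an asynchronous slab allocation to the global work queue, every caller then waits on skc_waitq for up to one tick, and anyone still waiting after the timeout falls back to spl_emergency_alloc() for a single minimally sized object. A rough user-space sketch of that control flow, using pthreads in place of the work queue, bit flag, and wait queue (every helper name here is a stand-in, not an SPL API):

/*
 * Sketch of spl_cache_grow()'s strategy under the assumptions above:
 * one dispatcher, a short bounded wait, then a synchronous fallback.
 */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static bool growing;

static void *slab_alloc_work(void *arg)
{
        (void)arg;
        /* ... allocate a new slab and link it onto the partial list ... */

        pthread_mutex_lock(&lock);
        growing = false;                  /* KMC_BIT_GROWING cleared */
        pthread_cond_broadcast(&waitq);   /* wake_up_all(&skc->skc_waitq) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Returns 0 on success; *obj is set only when the emergency path is used. */
static int cache_grow(void **obj, void *(*emergency_alloc)(void))
{
        struct timespec deadline;
        pthread_t tid;
        int waited = 0;

        *obj = NULL;

        pthread_mutex_lock(&lock);
        if (!growing) {
                /* First caller dispatches the asynchronous slab allocation. */
                growing = true;
                pthread_create(&tid, NULL, slab_alloc_work, NULL);
                pthread_detach(tid);
        }

        /* Everyone waits briefly for the new slab to show up... */
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 1;             /* stand-in for the one-tick wait */
        while (growing && waited == 0)
                waited = pthread_cond_timedwait(&waitq, &lock, &deadline);
        pthread_mutex_unlock(&lock);

        /* ...and falls back to a minimal synchronous allocation on timeout. */
        if (waited != 0)
                *obj = emergency_alloc();

        return (waited != 0 && *obj == NULL) ? -1 : 0;
}

The point of the design is that the expensive, possibly large slab allocation can no longer block the caller indefinitely; forward progress is guaranteed by the small synchronous emergency allocation.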
@@ -1647,8 +1786,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
                 if (list_empty(&skc->skc_partial_list)) {
                         spin_unlock(&skc->skc_lock);
 
-                        sks = spl_cache_grow(skc, flags);
-                        if (!sks)
+                        local_irq_enable();
+                        rc = spl_cache_grow(skc, flags, &obj);
+                        local_irq_disable();
+
+                        /* Emergency object for immediate use by caller */
+                        if (rc == 0 && obj != NULL)
+                                SRETURN(obj);
+
+                        if (rc)
                                 SGOTO(out, rc);
 
                         /* Rescheduled to different CPU skm is not local */
@@ -1673,9 +1819,9 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 
                 /* Consume as many objects as needed to refill the requested
                  * cache.  We must also be careful not to overfill it. */
-                while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
+                while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
                         ASSERT(skm->skm_avail < skm->skm_size);
-                        ASSERT(rc < skm->skm_size);
+                        ASSERT(count < skm->skm_size);
                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
                 }
 
@@ -1688,8 +1834,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
         spin_unlock(&skc->skc_lock);
 
 out:
-        /* Returns the number of entries added to cache */
-        SRETURN(rc);
+        SRETURN(NULL);
 }
 
 /*
@@ -1804,10 +1949,9 @@ restart:
                 obj = skm->skm_objs[--skm->skm_avail];
                 skm->skm_age = jiffies;
         } else {
-                /* Per-CPU cache empty, directly allocate from
-                 * the slab and refill the per-CPU cache. */
-                (void)spl_cache_refill(skc, skm, flags);
-                SGOTO(restart, obj = NULL);
+                obj = spl_cache_refill(skc, skm, flags);
+                if (obj == NULL)
+                        SGOTO(restart, obj = NULL);
         }
 
         local_irq_restore(irq_flags);
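With spl_cache_refill() now returning a pointer, the allocation fast path has three outcomes: a magazine hit, a refilled magazine worth retrying, or an emergency object handed straight back to the caller instead of forcing another trip through the magazine. A compile-only outline of that loop (the magazine layout and cache_refill() are assumptions for illustration, not the SPL definitions):

/*
 * Hypothetical outline of the allocation path after this change.
 */
struct magazine {
        unsigned int avail;      /* objects currently in the magazine */
        void *objs[64];          /* cached object pointers */
};

/* Refills the magazine, or returns a single emergency object directly. */
void *cache_refill(struct magazine *skm);

void *cache_alloc(struct magazine *skm)
{
        void *obj;

restart:
        if (skm->avail > 0)
                return skm->objs[--skm->avail];   /* magazine hit */

        obj = cache_refill(skm);
        if (obj != NULL)
                return obj;       /* emergency object, use it immediately */

        goto restart;             /* magazine was refilled, try again */
}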
@@ -1838,6 +1982,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
         atomic_inc(&skc->skc_ref);
+
+        /*
+         * Emergency objects are never part of the virtual address space,
+         * so if we get a virtual address we can optimize this check out.
+         */
+        if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
+                SGOTO(out, 0);
+
         local_irq_save(flags);
 
         /* Safe to update per-cpu structure without lock, but
@@ -1855,6 +2007,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         skm->skm_objs[skm->skm_avail++] = obj;
 
         local_irq_restore(flags);
+out:
         atomic_dec(&skc->skc_ref);
 
         SEXIT;
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
index 8149143ae..11a2d1068 100644
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,12 @@ slab_seq_show_headers(struct seq_file *f)
             "--------------------- cache ----------"
             "---------------------------------------------  "
             "----- slab ------  "
-            "---- object -----\n");
+            "---- object -----------------\n");
         seq_printf(f,
             "name                                  "
             "  flags      size     alloc slabsize  objsize  "
             "total alloc   max  "
-            "total alloc   max\n");
+            "total alloc   max emerg   max\n");
 }
 
 static int
@@ -643,7 +643,7 @@ slab_seq_show(struct seq_file *f, void *p)
         spin_lock(&skc->skc_lock);
         seq_printf(f, "%-36s  ", skc->skc_name);
         seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
-            "%5lu %5lu %5lu %5lu %5lu %5lu\n",
+            "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
             (long unsigned)skc->skc_flags,
             (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
             (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -654,7 +654,9 @@ slab_seq_show(struct seq_file *f, void *p)
             (long unsigned)skc->skc_slab_max,
             (long unsigned)skc->skc_obj_total,
             (long unsigned)skc->skc_obj_alloc,
-            (long unsigned)skc->skc_obj_max);
+            (long unsigned)skc->skc_obj_max,
+            (long unsigned)skc->skc_obj_emergency,
+            (long unsigned)skc->skc_obj_emergency_max);
         spin_unlock(&skc->skc_lock);
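The test added to spl_kmem_cache_free() relies on a simple invariant: emergency objects always come from kmalloc(), so they can never carry a vmalloc address. A short sketch of that dispatch under the same assumption (is_vmalloc_range(), emergency_free(), and magazine_free() are hypothetical stand-ins for kmem_virt(), spl_emergency_free(), and the per-CPU magazine path):

/*
 * Sketch of the new free-path dispatch.
 */
#include <stdbool.h>

bool is_vmalloc_range(const void *obj);  /* true for vmalloc addresses */
int  emergency_free(void *obj);          /* 0 iff obj was an emergency object */
void magazine_free(void *obj);           /* normal slab/magazine free path */

void cache_free(void *obj)
{
        /*
         * Emergency objects come from kmalloc(), so a vmalloc address can
         * only belong to a regular slab object and the list walk is skipped.
         */
        if (!is_vmalloc_range(obj) && emergency_free(obj) == 0)
                return;

        magazine_free(obj);
}

Checking the address class first means caches whose slabs live in the vmalloc range never pay for the emergency-list walk on free; kmem-backed caches walk a list that is empty except after an emergency allocation. The spl-proc.c hunks simply surface skc_obj_emergency and skc_obj_emergency_max as the two extra "emerg" and "max" columns so the emergency path can be monitored.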