-rw-r--r--   include/sys/kmem.h       17
-rw-r--r--   module/spl/spl-kmem.c   233
-rw-r--r--   module/spl/spl-proc.c    10
3 files changed, 216 insertions(+), 44 deletions(-)
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 344e2716b..aaff6d046 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -291,6 +291,7 @@ enum {
KMC_BIT_KMEM = 5, /* Use kmem cache */
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
+ KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
@@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
#define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
+#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@@ -374,6 +376,17 @@ typedef struct spl_kmem_slab {
uint32_t sks_ref; /* Ref count used objects */
} spl_kmem_slab_t;
+typedef struct spl_kmem_alloc {
+ struct spl_kmem_cache *ska_cache; /* Owned by cache */
+ int ska_flags; /* Allocation flags */
+ struct delayed_work ska_work; /* Allocation work */
+} spl_kmem_alloc_t;
+
+typedef struct spl_kmem_emergency {
+ void *ske_obj; /* Buffer address */
+ struct list_head ske_list; /* Emergency list linkage */
+} spl_kmem_emergency_t;
+
typedef struct spl_kmem_cache {
uint32_t skc_magic; /* Sanity magic */
uint32_t skc_name_size; /* Name length */
@@ -398,7 +411,9 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
+ struct list_head skc_emergency_list; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
+ wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
uint64_t skc_slab_create;/* Slab creates */
uint64_t skc_slab_destroy;/* Slab destroys */
@@ -408,6 +423,8 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
+ uint64_t skc_obj_emergency; /* Obj emergency current */
+ uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;
#define kmem_cache_t spl_kmem_cache_t
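Note on the new spl_kmem_alloc_t above: it embeds a struct delayed_work precisely so the deferred grow handler can recover the cache pointer and allocation flags from nothing but the work pointer. A minimal sketch of that embedding pattern, assuming the post-2.6.30 workqueue API (the module's spl_get_work_data()/spl_init_delayed_work() compatibility wrappers are expected to reduce to roughly this; all example_* names are hypothetical):

#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/errno.h>

/* Recover the enclosing request from the work_struct passed to the handler. */
static void example_grow_work(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        spl_kmem_alloc_t *ska = container_of(dwork, spl_kmem_alloc_t, ska_work);

        /* ska->ska_cache and ska->ska_flags fully describe the pending grow. */
        kfree(ska);
}

/* Dispatch side: one heap-allocated request per deferred grow. */
static int example_dispatch_grow(spl_kmem_cache_t *skc, int flags)
{
        spl_kmem_alloc_t *ska = kmalloc(sizeof(*ska), flags);

        if (ska == NULL)
                return -ENOMEM;

        ska->ska_cache = skc;
        ska->ska_flags = flags;
        INIT_DELAYED_WORK(&ska->ska_work, example_grow_work);
        schedule_delayed_work(&ska->ska_work, 0);

        return 0;
}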
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 258d61478..4cf3b26ad 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1144,6 +1144,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
}
/*
+ * Allocate a single emergency object for use by the caller.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ spl_kmem_emergency_t *ske;
+ int empty;
+ SENTRY;
+
+ /* Last chance: use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ SRETURN(-EEXIST);
+
+ ske = kmalloc(sizeof(*ske), flags);
+ if (ske == NULL)
+ SRETURN(-ENOMEM);
+
+ ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
+ if (ske->ske_obj == NULL) {
+ kfree(ske);
+ SRETURN(-ENOMEM);
+ }
+
+ if (skc->skc_ctor)
+ skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
+ spin_lock(&skc->skc_lock);
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+
+ list_add(&ske->ske_list, &skc->skc_emergency_list);
+ spin_unlock(&skc->skc_lock);
+
+ *obj = ske->ske_obj;
+
+ SRETURN(0);
+}
+
+/*
+ * Locate the passed object on the cache's emergency list and free it;
+ * -ENOENT is returned for a normal slab object so the caller can release
+ * it through the usual path. Currently this is done by walking what should
+ * be a short list; if this proves too inefficient we can switch to a hash.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *m, *n, *ske = NULL;
+ SENTRY;
+
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
+ if (m->ske_obj == obj) {
+ list_del(&m->ske_list);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ ske = m;
+ break;
+ }
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ SRETURN(-ENOENT);
+
+ if (skc->skc_dtor)
+ skc->skc_dtor(ske->ske_obj, skc->skc_private);
+
+ kfree(ske->ske_obj);
+ kfree(ske);
+
+ SRETURN(0);
+}
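The list walk above is expected to stay short; should it ever become a bottleneck, the hash mentioned in the comment could look roughly like the sketch below. This assumes a kernel providing <linux/hashtable.h> (3.7+); in a real conversion the table would live inside spl_kmem_cache_t and be protected by skc_lock, and all ske_hash_* names are hypothetical:

#include <linux/hashtable.h>

#define SKE_HASH_BITS        6        /* 64 buckets; purely illustrative */

/* The emergency header would carry an hlist_node instead of a list_head. */
typedef struct ske_hash_entry {
        void *ske_obj;                  /* Buffer address (the hash key) */
        struct hlist_node ske_node;     /* Hash table linkage */
} ske_hash_entry_t;

static DEFINE_HASHTABLE(ske_hash, SKE_HASH_BITS);

/* Insert under skc_lock, keyed on the object address. */
static void ske_hash_insert(ske_hash_entry_t *ske)
{
        hash_add(ske_hash, &ske->ske_node, (unsigned long)ske->ske_obj);
}

/* Average O(1) lookup replacing the list_for_each_entry_safe() walk above. */
static ske_hash_entry_t *ske_hash_remove(void *obj)
{
        ske_hash_entry_t *ske;

        hash_for_each_possible(ske_hash, ske, ske_node, (unsigned long)obj) {
                if (ske->ske_obj == obj) {
                        hash_del(&ske->ske_node);
                        return ske;
                }
        }

        return NULL;
}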
+
+/*
* Called regularly on all caches to age objects out of the magazines
* which have not been accessed in skc->skc_delay seconds. This prevents
* idle magazines from holding memory which might be better used by
@@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
+ INIT_LIST_HEAD(&skc->skc_emergency_list);
spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
skc->skc_slab_create = 0;
skc->skc_slab_destroy = 0;
@@ -1440,6 +1522,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
if (align) {
VERIFY(ISP2(align));
@@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_alloc, ==, 0);
ASSERT3U(skc->skc_slab_total, ==, 0);
ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
+ ASSERT(list_empty(&skc->skc_emergency_list));
kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
@@ -1581,59 +1667,112 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
}
/*
- * No available objects on any slabs, create a new slab. Since this
- * is an expensive operation we do it without holding the spin lock and
- * only briefly acquire it when we link in the fully allocated and
- * constructed slab.
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it into the list
+ * of partial slabs, and then waking any waiters.
*/
-static spl_kmem_slab_t *
-spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+static void
+spl_cache_grow_work(void *data)
{
+ spl_kmem_alloc_t *ska =
+ spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
+ spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;
+
+ sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ atomic_dec(&skc->skc_ref);
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ wake_up_all(&skc->skc_waitq);
+ spin_unlock(&skc->skc_lock);
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
+}
+
+/*
+ * No available objects on any slabs; create a new slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
- local_irq_enable();
might_sleep();
+ *obj = NULL;
/*
* Before allocating a new slab check if the slab is being reaped.
* If it is there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs.
*/
- if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
- schedule();
- SGOTO(out, sks= NULL);
- }
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ SRETURN(-EAGAIN);
- /* Allocate a new slab for the cache */
- sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
- if (sks == NULL)
- SGOTO(out, sks = NULL);
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+ * retaining the ability to safely fall back to a smaller synchronous
+ * allocation to ensure forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
- /* Link the new empty slab in to the end of skc_partial_list. */
- spin_lock(&skc->skc_lock);
- skc->skc_slab_total++;
- skc->skc_obj_total += sks->sks_objs;
- list_add_tail(&sks->sks_list, &skc->skc_partial_list);
- spin_unlock(&skc->skc_lock);
-out:
- local_irq_disable();
+ ska = kmalloc(sizeof(*ska), flags);
+ if (ska == NULL) {
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ wake_up_all(&skc->skc_waitq);
+ SRETURN(-ENOMEM);
+ }
- SRETURN(sks);
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
+ schedule_delayed_work(&ska->ska_work, 0);
+ }
+
+ /*
+ * Allow a single timer tick before falling back to synchronously
+ * allocating the minimum amount of memory required by the caller.
+ */
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), 1);
+ if (remaining == 0)
+ rc = spl_emergency_alloc(skc, flags, obj);
+
+ SRETURN(rc);
}
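For reference, the remaining == 0 test above relies on the standard wait_event_timeout() contract: it returns 0 when the timeout elapses with the condition still false, and the number of jiffies left otherwise. A self-contained illustration (example_wait_one_tick() is hypothetical):

#include <linux/wait.h>
#include <linux/errno.h>

/*
 * Illustration only: wait at most one jiffy for *done to become non-zero.
 * wait_event_timeout() returns 0 when the timeout expires with the
 * condition still false, otherwise the number of jiffies remaining.
 */
static int example_wait_one_tick(wait_queue_head_t *wq, int *done)
{
        long remaining = wait_event_timeout(*wq, *done, 1);

        return (remaining == 0) ? -ETIMEDOUT : 0;
}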
/*
- * Refill a per-cpu magazine with objects from the slabs for this
- * cache. Ideally the magazine can be repopulated using existing
- * objects which have been released, however if we are unable to
- * locate enough free objects new slabs of objects will be created.
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released; however, if we are unable to locate enough free objects,
+ * new slabs will be created. When the magazine has been refilled NULL is
+ * returned; otherwise the address of a single emergency object is returned
+ * for immediate use by the caller.
*/
-static int
+static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
spl_kmem_slab_t *sks;
- int rc = 0, refill;
+ int count = 0, rc, refill;
+ void *obj = NULL;
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1647,8 +1786,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
if (list_empty(&skc->skc_partial_list)) {
spin_unlock(&skc->skc_lock);
- sks = spl_cache_grow(skc, flags);
- if (!sks)
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ SRETURN(obj);
+
+ if (rc)
SGOTO(out, rc);
/* Rescheduled to different CPU skm is not local */
@@ -1673,9 +1819,9 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
/* Consume as many objects as needed to refill the requested
* cache. We must also be careful not to overfill it. */
- while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
ASSERT(skm->skm_avail < skm->skm_size);
- ASSERT(rc < skm->skm_size);
+ ASSERT(count < skm->skm_size);
skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
}
@@ -1688,8 +1834,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
spin_unlock(&skc->skc_lock);
out:
- /* Returns the number of entries added to cache */
- SRETURN(rc);
+ SRETURN(NULL);
}
/*
@@ -1804,10 +1949,9 @@ restart:
obj = skm->skm_objs[--skm->skm_avail];
skm->skm_age = jiffies;
} else {
- /* Per-CPU cache empty, directly allocate from
- * the slab and refill the per-CPU cache. */
- (void)spl_cache_refill(skc, skm, flags);
- SGOTO(restart, obj = NULL);
+ obj = spl_cache_refill(skc, skm, flags);
+ if (obj == NULL)
+ SGOTO(restart, obj = NULL);
}
local_irq_restore(irq_flags);
@@ -1838,6 +1982,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
atomic_inc(&skc->skc_ref);
+
+ /*
+ * Emergency objects are never part of the virtual address space, so for
+ * a virtual address the emergency list check can be skipped entirely.
+ */
+ if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
+ SGOTO(out, 0);
+
local_irq_save(flags);
/* Safe to update per-cpu structure without lock, but
@@ -1855,6 +2007,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
skm->skm_objs[skm->skm_avail++] = obj;
local_irq_restore(flags);
+out:
atomic_dec(&skc->skc_ref);
SEXIT;
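The kmem_virt() test in spl_kmem_cache_free() above is what lets vmalloc-backed slab objects bypass the emergency list: emergency objects always come from kmalloc(), so a vmalloc address can never match one. A minimal equivalent built only from stock kernel helpers (the actual SPL macro may be defined differently; example_is_virtual() is hypothetical):

#include <linux/mm.h>        /* is_vmalloc_addr() */

/*
 * Illustration only: true when the pointer lies in the vmalloc range,
 * i.e. it cannot be one of the kmalloc()'d emergency objects.
 */
static inline int example_is_virtual(const void *obj)
{
        return is_vmalloc_addr(obj);
}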
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
index 8149143ae..11a2d1068 100644
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,12 @@ slab_seq_show_headers(struct seq_file *f)
"--------------------- cache ----------"
"--------------------------------------------- "
"----- slab ------ "
- "---- object -----\n");
+ "---- object -----------------\n");
seq_printf(f,
"name "
" flags size alloc slabsize objsize "
"total alloc max "
- "total alloc max\n");
+ "total alloc max emerg max\n");
}
static int
@@ -643,7 +643,7 @@ slab_seq_show(struct seq_file *f, void *p)
spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
- "%5lu %5lu %5lu %5lu %5lu %5lu\n",
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
(long unsigned)skc->skc_flags,
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -654,7 +654,9 @@ slab_seq_show(struct seq_file *f, void *p)
(long unsigned)skc->skc_slab_max,
(long unsigned)skc->skc_obj_total,
(long unsigned)skc->skc_obj_alloc,
- (long unsigned)skc->skc_obj_max);
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
spin_unlock(&skc->skc_lock);