aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2009-02-12 13:32:10 -0800
committerBrian Behlendorf <[email protected]>2009-02-12 13:32:10 -0800
commit37db7d8cf9936e6d2851a4329c11efcd9f61305c (patch)
treef4fe4f3523de9302b80e56d3240a8c6f58f5c0e4
parentf500ccff35a55d04e7f6b8ddb54ff6e54e9c5d70 (diff)
kmem slab fixes
- Default SPL_KMEM_CACHE_DELAY changed to 15 to match Solaris. - Aged out slab checking occurs every SPL_KMEM_CACHE_DELAY / 3. - skc->skc_reap tunable added which allows callers of spl_slab_reclaim() to cap the number of slabs reclaimed. On Solaris all eligible slabs are always reclaimed, and this is still the default behavior. However, I suspect that is not always wise for reasons such as in the next comment. - spl_slab_reclaim() added cond_resched() while walking the slab/object free lists. Soft lockups were observed when freeing large numbers of vmalloc'd slabs/objects. - spl_slab_reclaim() 'sks->sks_ref > 0' check changed from the incorrect 'break' to 'continue' to ensure all slabs are checked. - spl_cache_age() reworked to avoid a deadlock with do_flush_tlb_all() which occurred because we slept waiting for completion in spl_cache_age(). Waiting for magazine reclamation to finish is not required, so we no longer wait. - spl_magazine_create() and spl_magazine_destroy() shifted back to using for_each_online_cpu() instead of the spl_on_each_cpu() approach, which was of course a bad idea due to memory allocations, as Ricardo pointed out.
-rw-r--r--include/sys/kmem.h4
-rw-r--r--module/spl/spl-kmem.c83
2 files changed, 52 insertions, 35 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 7281f1063..dc66a9153 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -239,7 +239,8 @@ extern struct rw_semaphore spl_kmem_cache_sem;
#define SKS_MAGIC 0x22222222
#define SKC_MAGIC 0x2c2c2c2c
-#define SPL_KMEM_CACHE_DELAY 5 /* Minimum slab release age */
+#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
+#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 /* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
@@ -292,6 +293,7 @@ typedef struct spl_kmem_cache {
uint32_t skc_slab_objs; /* Objects per slab */
uint32_t skc_slab_size; /* Slab size */
uint32_t skc_delay; /* Slab reclaim interval */
+ uint32_t skc_reap; /* Slab reclaim count */
atomic_t skc_ref; /* Ref count callers */
struct delayed_work skc_work; /* Slab reclaim work */
struct work_struct work;
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index b5cd9fb12..d82d7b49f 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -856,16 +856,19 @@ spl_slab_free(spl_kmem_slab_t *sks,
/*
* Traverses all the partial slabs attached to a cache and free those
* which which are currently empty, and have not been touched for
- * skc_delay seconds. This is to avoid thrashing.
+ * skc_delay seconds to avoid thrashing. The count argument is
+ * passed to optionally cap the number of slabs reclaimed, a count
+ * of zero means try and reclaim everything. When flag is set we
+ * always free an available slab regardless of age.
*/
static void
-spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
+spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
{
spl_kmem_slab_t *sks, *m;
spl_kmem_obj_t *sko, *n;
LIST_HEAD(sks_list);
LIST_HEAD(sko_list);
- int size;
+ int size, i = 0;
ENTRY;
/*
@@ -878,11 +881,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
spin_lock(&skc->skc_lock);
list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
sks_list) {
+ /* Release at most count slabs */
+ if (count && i > count)
+ break;
+
+ /* Skip active slabs */
if (sks->sks_ref > 0)
- break;
+ continue;
- if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
+ if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
spl_slab_free(sks, &sks_list, &sko_list);
+ i++;
+ }
}
spin_unlock(&skc->skc_lock);
@@ -896,12 +906,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
- list_for_each_entry_safe(sko, n, &sko_list, sko_list)
+ /* To avoid soft lockups conditionally reschedule */
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
kv_free(skc, sko->sko_addr, size);
+ cond_resched();
+ }
}
- list_for_each_entry_safe(sks, m, &sks_list, sks_list)
+ /* To avoid soft lockups conditionally reschedule */
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
kv_free(skc, sks, skc->skc_slab_size);
+ cond_resched();
+ }
EXIT;
}
@@ -937,11 +953,11 @@ spl_cache_age(void *data)
spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
ASSERT(skc->skc_magic == SKC_MAGIC);
- spl_on_each_cpu(spl_magazine_age, skc, 1);
- spl_slab_reclaim(skc, 0);
+ spl_slab_reclaim(skc, skc->skc_reap, 0);
+ spl_on_each_cpu(spl_magazine_age, skc, 0);
if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
- schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+ schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
}
/*
@@ -1057,39 +1073,29 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
EXIT;
}
-static void
-__spl_magazine_create(void *data)
-{
- spl_kmem_cache_t *skc = data;
- int id = smp_processor_id();
-
- skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
- ASSERT(skc->skc_mag[id]);
-}
-
/*
* Create all pre-cpu magazines of reasonable sizes.
*/
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
+ int i;
ENTRY;
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
- spl_on_each_cpu(__spl_magazine_create, skc, 1);
- RETURN(0);
-}
+ for_each_online_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
-static void
-__spl_magazine_destroy(void *data)
-{
- spl_kmem_cache_t *skc = data;
- spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+ RETURN(-ENOMEM);
+ }
+ }
- (void)spl_cache_flush(skc, skm, skm->skm_avail);
- spl_magazine_free(skm);
+ RETURN(0);
}
/*
@@ -1098,8 +1104,16 @@ __spl_magazine_destroy(void *data)
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
+ spl_kmem_magazine_t *skm;
+ int i;
ENTRY;
- spl_on_each_cpu(__spl_magazine_destroy, skc, 1);
+
+ for_each_online_cpu(i) {
+ skm = skc->skc_mag[i];
+ (void)spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
EXIT;
}
@@ -1168,6 +1182,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_size = size;
skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+ skc->skc_reap = SPL_KMEM_CACHE_REAP;
atomic_set(&skc->skc_ref, 0);
INIT_LIST_HEAD(&skc->skc_list);
@@ -1209,7 +1224,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
GOTO(out, rc);
spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
- schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+ schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
down_write(&spl_kmem_cache_sem);
list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@@ -1249,7 +1264,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
wait_event(wq, atomic_read(&skc->skc_ref) == 0);
spl_magazine_destroy(skc);
- spl_slab_reclaim(skc, 1);
+ spl_slab_reclaim(skc, 0, 1);
spin_lock(&skc->skc_lock);
/* Validate there are no objects in use and free all the
@@ -1654,7 +1669,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
- spl_slab_reclaim(skc, 0);
+ spl_slab_reclaim(skc, skc->skc_reap, 0);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
atomic_dec(&skc->skc_ref);