| author    | behlendo <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c> | 2008-06-13 23:41:06 +0000 |
|-----------|----------------------------------------------------------|---------------------------|
| committer | behlendo <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c> | 2008-06-13 23:41:06 +0000 |
| commit    | 2fb9b26a85ce63dab740821382f1208c8fed6262 (patch)         |                           |
| tree      | 5f075860b82f9180add51b05565331c135fb3c2e                 |                           |
| parent    | cfe5749941bafbc0bfc069aff00fa0b930d741fa (diff)          |                           |
* : modules/sys/kmem-slab.c : Re-implemented the slab to no
longer be based on the linux slab but to be its own complete
implementation. The new slab behaves much more like the
Solaris slab than the Linux slab.
git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@132 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
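
The caller-facing API is unchanged by this rewrite: `kmem_cache_create()` still takes Solaris-style constructor, destructor, and reclaim callbacks plus private data, and the macros in `include/sys/kmem.h` now expand to `spl_kmem_cache_create()` and friends. Below is a minimal consumer-side sketch modeled on the `splat-kmem.c` tests in this patch; the `my_obj`/`my_obj_cache` names are illustrative only, and the comments about when ctors/dtors run reflect this particular slab implementation.

```c
#include <sys/kmem.h>	/* SPL Solaris-compatibility header from this tree */

typedef struct my_obj {
	int	mo_state;
	char	mo_buf[64];
} my_obj_t;

/* Solaris-style callbacks: ctor takes (obj, priv, kmflags), dtor (obj, priv) */
static int
my_obj_ctor(void *buf, void *priv, int kmflags)
{
	my_obj_t *mo = buf;

	mo->mo_state = 0;
	return (0);
}

static void
my_obj_dtor(void *buf, void *priv)
{
	my_obj_t *mo = buf;

	mo->mo_state = -1;
}

static void
my_obj_cache_example(void)
{
	kmem_cache_t *cache;
	my_obj_t *mo;

	/* Expands to spl_kmem_cache_create() via the macros in sys/kmem.h */
	cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
				  my_obj_ctor, my_obj_dtor, NULL,
				  NULL, NULL, 0);

	/* In this slab, ctors run when a new slab of objects is built */
	mo = kmem_cache_alloc(cache, KM_SLEEP);
	/* ... use mo ... */
	kmem_cache_free(cache, mo);	/* dtor deferred until the slab is freed */
	kmem_cache_destroy(cache);	/* frees remaining slabs, running dtors */
}
```
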
| -rw-r--r-- | ChangeLog                  |   7 |
| -rw-r--r-- | include/sys/kmem.h         | 119 |
| -rw-r--r-- | include/sys/types.h        |   1 |
| -rw-r--r-- | modules/spl/spl-generic.c  |   7 |
| -rw-r--r-- | modules/spl/spl-kmem.c     | 989 |
| -rw-r--r-- | modules/spl/spl-vnode.c    |  11 |
| -rw-r--r-- | modules/splat/splat-kmem.c | 310 |
7 files changed, 875 insertions, 569 deletions
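
The reclaim path keeps its Solaris shape as well: the single generic shrinker registered by the SPL walks `spl_kmem_cache_list` and calls `spl_kmem_cache_reap_now()`, which first invokes the cache's registered reclaim callback and then frees empty slabs that have been idle longer than `skc_delay` seconds. The following is a hedged consumer-side sketch in the style of `splat_kmem_cache_test_reclaim()` in the patch below; the `my_pool` structure, its fields, and the object size are hypothetical.

```c
#include <sys/kmem.h>

/* Hypothetical consumer that keeps a small stash of idle objects */
typedef struct my_pool {
	kmem_cache_t	*mp_cache;
	void		*mp_idle[16];
	int		mp_idle_count;
} my_pool_t;

/* Reclaim callback: receives only the private pointer given at create time */
static void
my_pool_reclaim(void *priv)
{
	my_pool_t *mp = priv;

	/* Return idle objects so their slabs can empty out and be freed */
	while (mp->mp_idle_count > 0)
		kmem_cache_free(mp->mp_cache,
		    mp->mp_idle[--mp->mp_idle_count]);
}

/* Registration: the reclaim hook and private data ride along at create time */
static kmem_cache_t *
my_pool_cache_create(my_pool_t *mp)
{
	mp->mp_idle_count = 0;
	mp->mp_cache = kmem_cache_create("my_pool_cache", 256, 0,
	    NULL, NULL, my_pool_reclaim, mp, NULL, 0);
	return (mp->mp_cache);
}
```

Under memory pressure, or after an explicit `kmem_reap()`, the shrinker invokes `my_pool_reclaim()` along with every other registered reclaim hook; once a slab's objects have all been returned and it has sat untouched for `SPL_KMEM_CACHE_DELAY` (5) seconds, `__slab_reclaim()` hands it back to the system.
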
@@ -1,3 +1,10 @@ +2008-06-13 Brian Behlendorf <[email protected]> + + * : modules/sys/kmem-slab.c : Re-implemented the slab to no + longer be based on the linux slab but to be it's own complete + implementation. The new slab behaves much more like the + Solaris slab than the Linux slab. + 2008-06-04 Brian Behlendorf <[email protected]> * : Tag spl-0.3.2 diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 0b1b53687..082db032a 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -308,11 +308,11 @@ kmem_alloc_tryhard(size_t size, size_t *alloc_size, int kmflags) /* * Slab allocation interfaces */ -#undef KMC_NOTOUCH /* No linux analog */ +#undef KMC_NOTOUCH /* XXX: Unsupported */ #define KMC_NODEBUG 0x00000000 /* Default behavior */ -#define KMC_NOMAGAZINE /* No linux analog */ -#define KMC_NOHASH /* No linux analog */ -#define KMC_QCACHE /* No linux analog */ +#define KMC_NOMAGAZINE /* XXX: Unsupported */ +#define KMC_NOHASH /* XXX: Unsupported */ +#define KMC_QCACHE /* XXX: Unsupported */ #define KMC_REAP_CHUNK 256 #define KMC_DEFAULT_SEEKS DEFAULT_SEEKS @@ -342,7 +342,7 @@ static __inline__ size_t kmem_maxavail(void) { #error "kmem_maxavail() not implemented" } -static __inline__ uint64_t kmem_cache_stat(kmem_cache_t *cache) { +static __inline__ uint64_t kmem_cache_stat(spl_kmem_cache_t *cache) { #error "kmem_cache_stat() not implemented" } #endif /* DEBUG_KMEM_UNIMPLEMENTED */ @@ -357,34 +357,101 @@ kmem_debugging(void) return 0; } -typedef int (*kmem_constructor_t)(void *, void *, int); -typedef void (*kmem_destructor_t)(void *, void *); -typedef void (*kmem_reclaim_t)(void *); - extern int kmem_set_warning(int flag); -extern kmem_cache_t * -__kmem_cache_create(char *name, size_t size, size_t align, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, + +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c + +#define SPL_KMEM_CACHE_HASH_BITS 12 /* 4k, sized for 1000's of objs */ +#define SPL_KMEM_CACHE_HASH_ELTS (1 << SPL_KMEM_CACHE_HASH_BITS) +#define SPL_KMEM_CACHE_HASH_SIZE (sizeof(struct hlist_head) * \ + SPL_KMEM_CACHE_HASH_ELTS) + +#define SPL_KMEM_CACHE_DELAY 5 +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 + +typedef int (*spl_kmem_ctor_t)(void *, void *, int); +typedef void (*spl_kmem_dtor_t)(void *, void *); +typedef void (*spl_kmem_reclaim_t)(void *); + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + uint32_t sko_flags; /* Per object flags */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ + struct hlist_node sko_hlist; /* Used object hash linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + atomic_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + uint32_t skc_flags; /* Flags */ + 
uint32_t skc_obj_size; /* Object size */ + uint32_t skc_chunk_size; /* sizeof(*obj) + alignment */ + uint32_t skc_slab_size; /* slab size */ + uint32_t skc_max_chunks; /* max chunks per slab */ + uint32_t skc_delay; /* slab reclaim interval */ + uint32_t skc_hash_bits; /* Hash table bits */ + uint32_t skc_hash_size; /* Hash table size */ + uint32_t skc_hash_elts; /* Hash table elements */ + struct hlist_head *skc_hash; /* Hash table address */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list;/* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rw_semaphore skc_sem; /* Cache semaphore */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create;/* Slab creates */ + uint64_t skc_slab_destroy;/* Slab destroys */ + uint64_t skc_slab_total; /* Slab total */ + uint64_t skc_slab_alloc; /* Slab alloc */ + uint64_t skc_slab_max; /* Slab max */ + uint64_t skc_obj_total; /* Obj total */ + uint64_t skc_obj_alloc; /* Obj alloc */ + uint64_t skc_obj_max; /* Obj max */ + uint64_t skc_hash_depth; /* Hash depth */ + uint64_t skc_hash_max; /* Hash depth max */ +} spl_kmem_cache_t; + +extern spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); -extern int __kmem_cache_destroy(kmem_cache_t *cache); -extern void *__kmem_cache_alloc(kmem_cache_t *cache, gfp_t flags); -extern void __kmem_cache_free(kmem_cache_t *cache, void *obj); -extern void __kmem_reap(void); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); +extern void spl_kmem_reap(void); -int kmem_init(void); -void kmem_fini(void); +int spl_kmem_init(void); +void spl_kmem_fini(void); #define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ - __kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) -#define kmem_cache_destroy(cache) __kmem_cache_destroy(cache) -#define kmem_cache_alloc(cache, flags) __kmem_cache_alloc(cache, flags) -#define kmem_cache_free(cache, obj) __kmem_cache_free(cache, obj) -#define kmem_cache_reap_now(cache) kmem_cache_shrink(cache) -#define kmem_reap() __kmem_reap() + spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) +#define kmem_reap() spl_kmem_reap() #ifdef __cplusplus } diff --git a/include/sys/types.h b/include/sys/types.h index c2ad9837d..c60bfb208 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -16,6 +16,7 @@ typedef unsigned long uintptr_t; #ifndef HAVE_KMEM_CACHE_T typedef struct kmem_cache kmem_cache_t; #endif +#define kmem_cache_t spl_kmem_cache_t typedef enum { B_FALSE=0, B_TRUE=1 } boolean_t; typedef unsigned long intptr_t; diff --git a/modules/spl/spl-generic.c b/modules/spl/spl-generic.c index 3027c7108..7a073ee52 100644 --- a/modules/spl/spl-generic.c +++ b/modules/spl/spl-generic.c @@ -130,7 +130,7 @@ static int __init spl_init(void) if ((rc = debug_init())) return rc; - if ((rc = kmem_init())) + if ((rc = spl_kmem_init())) GOTO(out , rc); if ((rc = 
spl_mutex_init())) @@ -159,7 +159,7 @@ out4: out3: spl_mutex_fini(); out2: - kmem_fini(); + spl_kmem_fini(); out: debug_fini(); @@ -176,7 +176,8 @@ static void spl_fini(void) kstat_fini(); proc_fini(); vn_fini(); - kmem_fini(); + spl_mutex_fini(); + spl_kmem_fini(); debug_fini(); } diff --git a/modules/spl/spl-kmem.c b/modules/spl/spl-kmem.c index d7643067a..e52f19935 100644 --- a/modules/spl/spl-kmem.c +++ b/modules/spl/spl-kmem.c @@ -33,7 +33,13 @@ #define DEBUG_SUBSYSTEM S_KMEM /* - * Memory allocation interfaces + * Memory allocation interfaces and debugging for basic kmem_* + * and vmem_* style memory allocation. When DEBUG_KMEM is enable + * all allocations will be tracked when they are allocated and + * freed. When the SPL module is unload a list of all leaked + * addresses and where they were allocated will be dumped to the + * console. Enabling this feature has a significant impant on + * performance but it makes finding memory leaks staight forward. */ #ifdef DEBUG_KMEM /* Shim layer memory accounting */ @@ -75,477 +81,590 @@ EXPORT_SYMBOL(kmem_set_warning); /* * Slab allocation interfaces * - * While the linux slab implementation was inspired by solaris they - * have made some changes to the API which complicates this shim - * layer. For one thing the same symbol names are used with different - * arguments for the prototypes. To deal with this we must use the - * preprocessor to re-order arguments. Happily for us standard C says, - * "Macro's appearing in their own expansion are not reexpanded" so - * this does not result in an infinite recursion. Additionally the - * function pointers registered by solarias differ from those used - * by linux so a lookup and mapping from linux style callback to a - * solaris style callback is needed. There is some overhead in this - * operation which isn't horibile but it needs to be kept in mind. + * While the Linux slab implementation was inspired by the Solaris + * implemenation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleaner for these data types unlike + * may Linux data type which do need to be explicitly destroyed. + * + * 2) Virtual address backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contigeous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + * + * XXX: Refactor the below code in to smaller functions. This works + * for a first pass but each function is doing to much. 
+ * + * XXX: Implement SPL proc interface to export full per cache stats. + * + * XXX: Implement work requests to keep an eye on each cache and + * shrink them via slab_reclaim() when they are wasting lots + * of space. Currently this process is driven by the reapers. + * + * XXX: Implement proper small cache object support by embedding + * the spl_kmem_slab_t, spl_kmem_obj_t's, and objects in the + * allocated for a particular slab. + * + * XXX: Implement a resizable used object hash. Currently the hash + * is statically sized for thousands of objects but it should + * grow based on observed worst case slab depth. + * + * XXX: Improve the partial slab list by carefully maintaining a + * strict ordering of fullest to emptiest slabs based on + * the slab reference count. This gaurentees the when freeing + * slabs back to the system we need only linearly traverse the + * last N slabs in the list to discover all the freeable slabs. + * + * XXX: NUMA awareness for optionally allocating memory close to a + * particular core. This can be adventageous if you know the slab + * object will be short lived and primarily accessed from one core. + * + * XXX: Slab coloring may also yield performance improvements and would + * be desirable to implement. */ -#define KCC_MAGIC 0x7a7a7a7a -#define KCC_POISON 0x77 - -typedef struct kmem_cache_cb { - int kcc_magic; - struct hlist_node kcc_hlist; - struct list_head kcc_list; - kmem_cache_t * kcc_cache; - kmem_constructor_t kcc_constructor; - kmem_destructor_t kcc_destructor; - kmem_reclaim_t kcc_reclaim; - void * kcc_private; - void * kcc_vmp; - atomic_t kcc_ref; -} kmem_cache_cb_t; - -#define KMEM_CACHE_HASH_BITS 10 -#define KMEM_CACHE_TABLE_SIZE (1 << KMEM_CACHE_HASH_BITS) - -struct hlist_head kmem_cache_table[KMEM_CACHE_TABLE_SIZE]; -struct list_head kmem_cache_list; -static struct rw_semaphore kmem_cache_sem; + +/* Ensure the __kmem_cache_create/__kmem_cache_destroy macros are + * removed here to prevent a recursive substitution, we want to call + * the native linux version. + */ +#undef kmem_cache_t +#undef kmem_cache_create +#undef kmem_cache_destroy +#undef kmem_cache_alloc +#undef kmem_cache_free + +static struct list_head spl_kmem_cache_list; /* List of caches */ +static struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +static kmem_cache_t *spl_slab_cache; /* Cache for slab structs */ +static kmem_cache_t *spl_obj_cache; /* Cache for obj structs */ #ifdef HAVE_SET_SHRINKER -static struct shrinker *kmem_cache_shrinker; +static struct shrinker *spl_kmem_cache_shrinker; #else static int kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask); -static struct shrinker kmem_cache_shrinker = { +static struct shrinker spl_kmem_cache_shrinker = { .shrink = kmem_cache_generic_shrinker, .seeks = KMC_DEFAULT_SEEKS, }; #endif -/* Function must be called while holding the kmem_cache_sem - * Because kmem_cache_t is an opaque datatype we're forced to - * match pointers to identify specific cache entires. 
- */ -static kmem_cache_cb_t * -kmem_cache_find_cache_cb(kmem_cache_t *cache) -{ - struct hlist_head *head; - struct hlist_node *node; - kmem_cache_cb_t *kcc; -#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK - ASSERT(rwsem_is_locked(&kmem_cache_sem)); -#endif - - head = &kmem_cache_table[hash_ptr(cache, KMEM_CACHE_HASH_BITS)]; - hlist_for_each_entry_rcu(kcc, node, head, kcc_hlist) - if (kcc->kcc_cache == cache) - return kcc; +static spl_kmem_slab_t * +slab_alloc(spl_kmem_cache_t *skc, int flags) { + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + int i; + ENTRY; - return NULL; -} + sks = kmem_cache_alloc(spl_slab_cache, flags); + if (sks == NULL) + RETURN(sks); + + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + atomic_set(&sks->sks_ref, 0); + + for (i = 0; i < sks->sks_objs; i++) { + sko = kmem_cache_alloc(spl_obj_cache, flags); + if (sko == NULL) { +out_alloc: + /* Unable to fully construct slab, objects, + * and object data buffers unwind everything. + */ + list_for_each_entry_safe(sko, n, &sks->sks_free_list, + sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + vmem_free(sko->sko_addr, skc->skc_obj_size); + list_del(&sko->sko_list); + kmem_cache_free(spl_obj_cache, sko); + } + + kmem_cache_free(spl_slab_cache, sks); + GOTO(out, sks = NULL); + } -static kmem_cache_cb_t * -kmem_cache_add_cache_cb(kmem_cache_t *cache, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, - void *priv, void *vmp) -{ - kmem_cache_cb_t *kcc; - - kcc = (kmem_cache_cb_t *)kmalloc(sizeof(*kcc), GFP_KERNEL); - if (kcc) { - kcc->kcc_magic = KCC_MAGIC; - kcc->kcc_cache = cache; - kcc->kcc_constructor = constructor; - kcc->kcc_destructor = destructor; - kcc->kcc_reclaim = reclaim; - kcc->kcc_private = priv; - kcc->kcc_vmp = vmp; - atomic_set(&kcc->kcc_ref, 0); - down_write(&kmem_cache_sem); - hlist_add_head_rcu(&kcc->kcc_hlist, &kmem_cache_table[ - hash_ptr(cache, KMEM_CACHE_HASH_BITS)]); - list_add_tail(&kcc->kcc_list, &kmem_cache_list); - up_write(&kmem_cache_sem); - } - - return kcc; -} + sko->sko_addr = vmem_alloc(skc->skc_obj_size, flags); + if (sko->sko_addr == NULL) { + kmem_cache_free(spl_obj_cache, sko); + GOTO(out_alloc, sks = NULL); + } -static void -kmem_cache_remove_cache_cb(kmem_cache_cb_t *kcc) -{ - down_write(&kmem_cache_sem); - ASSERT(atomic_read(&kcc->kcc_ref) == 0); - hlist_del_init(&kcc->kcc_hlist); - list_del_init(&kcc->kcc_list); - up_write(&kmem_cache_sem); - - if (kcc) { - memset(kcc, KCC_POISON, sizeof(*kcc)); - kfree(kcc); + sko->sko_magic = SKO_MAGIC; + sko->sko_flags = 0; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + INIT_HLIST_NODE(&sko->sko_hlist); + list_add(&sko->sko_list, &sks->sks_free_list); } +out: + RETURN(sks); } -#ifdef HAVE_3ARG_KMEM_CACHE_CREATE_CTOR +/* Removes slab from complete or partial list, so it must + * be called with the 'skc->skc_sem' semaphore held. + * */ static void -kmem_cache_generic_constructor(void *ptr, kmem_cache_t *cache, - unsigned long flags) -{ - kmem_cache_cb_t *kcc; - kmem_constructor_t constructor; - void *private; - - /* Ensure constructor verifies are not passed to the registered - * constructors. 
This may not be safe due to the Solaris constructor - * not being aware of how to handle the SLAB_CTOR_VERIFY flag - */ - ASSERT(flags & SLAB_CTOR_CONSTRUCTOR); +slab_free(spl_kmem_slab_t *sks) { + spl_kmem_cache_t *skc; + spl_kmem_obj_t *sko, *n; + int i = 0; + ENTRY; - if (flags & SLAB_CTOR_VERIFY) - return; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(atomic_read(&sks->sks_ref) == 0); + skc = sks->sks_cache; + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; - if (flags & SLAB_CTOR_ATOMIC) - flags = KM_NOSLEEP; - else - flags = KM_SLEEP; -#else -static void -kmem_cache_generic_constructor(kmem_cache_t *cache, void *ptr) -{ - kmem_cache_cb_t *kcc; - kmem_constructor_t constructor; - void *private; - int flags = KM_NOSLEEP; +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + ASSERT(rwsem_is_locked(&skc->skc_sem)); #endif - /* We can be called with interrupts disabled so it is critical that - * this function and the registered constructor never sleep. - */ - while (!down_read_trylock(&kmem_cache_sem)); - /* Callback list must be in sync with linux slab caches */ - kcc = kmem_cache_find_cache_cb(cache); - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - atomic_inc(&kcc->kcc_ref); + list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); - constructor = kcc->kcc_constructor; - private = kcc->kcc_private; + /* Run destructors for being freed */ + if (skc->skc_dtor) + skc->skc_dtor(sko->sko_addr, skc->skc_private); - up_read(&kmem_cache_sem); + vmem_free(sko->sko_addr, skc->skc_obj_size); + list_del(&sko->sko_list); + kmem_cache_free(spl_obj_cache, sko); + i++; + } - if (constructor) - constructor(ptr, private, (int)flags); + ASSERT(sks->sks_objs == i); + list_del(&sks->sks_list); + kmem_cache_free(spl_slab_cache, sks); - atomic_dec(&kcc->kcc_ref); + EXIT; +} - /* Linux constructor has no return code, silently eat it */ +static int +__slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + int rc = 0; + ENTRY; + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + ASSERT(rwsem_is_locked(&skc->skc_sem)); +#endif + /* + * Free empty slabs which have not been touched in skc_delay + * seconds. This delay time is important to avoid thrashing. + * Empty slabs will be at the end of the skc_partial_list. + */ + list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, + sks_list) { + if (atomic_read(&sks->sks_ref) > 0) + break; + + if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) { + slab_free(sks); + rc++; + } + } + + /* Returns number of slabs reclaimed */ + RETURN(rc); } -static void -kmem_cache_generic_destructor(void *ptr, kmem_cache_t *cache, unsigned long flags) +static int +slab_reclaim(spl_kmem_cache_t *skc) { - kmem_cache_cb_t *kcc; - kmem_destructor_t destructor; - void *private; + int rc; + ENTRY; - /* No valid destructor flags */ - ASSERT(flags == 0); + down_write(&skc->skc_sem); + rc = __slab_reclaim(skc); + up_write(&skc->skc_sem); - /* We can be called with interrupts disabled so it is critical that - * this function and the registered constructor never sleep. 
- */ - while (!down_read_trylock(&kmem_cache_sem)); + RETURN(rc); +} - /* Callback list must be in sync with linux slab caches */ - kcc = kmem_cache_find_cache_cb(cache); - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - atomic_inc(&kcc->kcc_ref); +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, + spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + spl_kmem_cache_t *skc; + int i, kmem_flags = KM_SLEEP; + ENTRY; - destructor = kcc->kcc_destructor; - private = kcc->kcc_private; + /* We may be called when there is a non-zero preempt_count or + * interrupts are disabled is which case we must not sleep. + */ + if (current_thread_info()->preempt_count || irqs_disabled()) + kmem_flags = KM_NOSLEEP; - up_read(&kmem_cache_sem); + /* Allocate new cache memory and initialize. */ + skc = (spl_kmem_cache_t *)kmem_alloc(sizeof(*skc), kmem_flags); + if (skc == NULL) + RETURN(NULL); - /* Solaris destructor takes no flags, silently eat them */ - if (destructor) - destructor(ptr, private); + skc->skc_magic = SKC_MAGIC; - atomic_dec(&kcc->kcc_ref); + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags); + if (skc->skc_name == NULL) { + kmem_free(skc, sizeof(*skc)); + RETURN(NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_chunk_size = 0; /* XXX: Needed only when implementing */ + skc->skc_slab_size = 0; /* small slab object optimizations */ + skc->skc_max_chunks = 0; /* which are yet supported. */ + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + + skc->skc_hash_bits = SPL_KMEM_CACHE_HASH_BITS; + skc->skc_hash_size = SPL_KMEM_CACHE_HASH_SIZE; + skc->skc_hash_elts = SPL_KMEM_CACHE_HASH_ELTS; + skc->skc_hash = (struct hlist_head *) + kmem_alloc(skc->skc_hash_size, kmem_flags); + if (skc->skc_hash == NULL) { + kmem_free(skc->skc_name, skc->skc_name_size); + kmem_free(skc, sizeof(*skc)); + } + + for (i = 0; i < skc->skc_hash_elts; i++) + INIT_HLIST_HEAD(&skc->skc_hash[i]); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + init_rwsem(&skc->skc_sem); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_hash_depth = 0; + skc->skc_hash_max = 0; + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + RETURN(skc); } +EXPORT_SYMBOL(spl_kmem_cache_create); -/* Arguments are ignored */ -static int -kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) +/* The caller must ensure there are no racing calls to + * spl_kmem_cache_alloc() for this spl_kmem_cache_t when + * it is being destroyed. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { - kmem_cache_cb_t *kcc; - int total = 0; + spl_kmem_slab_t *sks, *m; + ENTRY; - /* Under linux a shrinker is not tightly coupled with a slab - * cache. In fact linux always systematically trys calling all - * registered shrinker callbacks until its target reclamation level - * is reached. 
Because of this we only register one shrinker - * function in the shim layer for all slab caches. And we always - * attempt to shrink all caches when this generic shrinker is called. - */ - down_read(&kmem_cache_sem); - - list_for_each_entry(kcc, &kmem_cache_list, kcc_list) { - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - - /* Take a reference on the cache in question. If that - * cache is contended simply skip it, it may already be - * in the process of a reclaim or the ctor/dtor may be - * running in either case it's best to skip it. - */ - atomic_inc(&kcc->kcc_ref); - if (atomic_read(&kcc->kcc_ref) > 1) { - atomic_dec(&kcc->kcc_ref); - continue; - } + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + down_write(&skc->skc_sem); - /* Under linux the desired number and gfp type of objects - * is passed to the reclaiming function as a sugested reclaim - * target. I do not pass these args on because reclaim - * policy is entirely up to the owner under solaris. We only - * pass on the pre-registered private data. - */ - if (kcc->kcc_reclaim) - kcc->kcc_reclaim(kcc->kcc_private); - - atomic_dec(&kcc->kcc_ref); - total += 1; - } - - /* Under linux we should return the remaining number of entires in - * the cache. Unfortunately, I don't see an easy way to safely - * emulate this behavior so I'm returning one entry per cache which - * was registered with the generic shrinker. This should fake out - * the linux VM when it attempts to shrink caches. + /* Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ - up_read(&kmem_cache_sem); + ASSERT(list_empty(&skc->skc_complete_list)); - return total; + list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list) + slab_free(sks); + + kmem_free(skc->skc_hash, skc->skc_hash_size); + kmem_free(skc->skc_name, skc->skc_name_size); + kmem_free(skc, sizeof(*skc)); + up_write(&skc->skc_sem); + + EXIT; } +EXPORT_SYMBOL(spl_kmem_cache_destroy); -/* Ensure the __kmem_cache_create/__kmem_cache_destroy macros are - * removed here to prevent a recursive substitution, we want to call - * the native linux version. +/* The kernel provided hash_ptr() function behaves exceptionally badly + * when all the addresses are page aligned which is likely the case + * here. To avoid this issue shift off the low order non-random bits. 
*/ -#undef kmem_cache_create -#undef kmem_cache_destroy -#undef kmem_cache_alloc -#undef kmem_cache_free +static unsigned long +spl_hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr >> PAGE_SHIFT, bits); +} -kmem_cache_t * -__kmem_cache_create(char *name, size_t size, size_t align, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { - kmem_cache_t *cache; - kmem_cache_cb_t *kcc; - int shrinker_flag = 0; - char *cache_name; + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko; + void *obj; + unsigned long key; ENTRY; - /* XXX: - Option currently unsupported by shim layer */ - ASSERT(!vmp); - ASSERT(flags == 0); + down_write(&skc->skc_sem); +restart: + /* Check for available objects from the partial slabs */ + if (!list_empty(&skc->skc_partial_list)) { + sks = list_first_entry(&skc->skc_partial_list, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(atomic_read(&sks->sks_ref) < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + sko = list_first_entry(&sks->sks_free_list, + spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list, add to used hash */ + list_del_init(&sko->sko_list); + key = spl_hash_ptr(sko->sko_addr, skc->skc_hash_bits); + hlist_add_head_rcu(&sko->sko_hlist, &skc->skc_hash[key]); + + sks->sks_age = jiffies; + atomic_inc(&sks->sks_ref); + skc->skc_obj_alloc++; + + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + if (atomic_read(&sks->sks_ref) == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } - cache_name = kzalloc(strlen(name) + 1, GFP_KERNEL); - if (cache_name == NULL) - RETURN(NULL); + /* Move slab to skc_complete_list when full */ + if (atomic_read(&sks->sks_ref) == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + + GOTO(out_lock, obj = sko->sko_addr); + } + + up_write(&skc->skc_sem); - strcpy(cache_name, name); - - /* When your slab is implemented in terms of the slub it - * is possible similarly sized slab caches will be merged. - * For our implementation we must make sure this never - * happens because we require a unique cache address to - * use as a hash key when looking up the constructor, - * destructor, and shrinker registered for each unique - * type of slab cache. Passing any of the following flags - * will prevent the slub merging. - * - * SLAB_RED_ZONE - * SLAB_POISON - * SLAB_STORE_USER - * SLAB_TRACE - * SLAB_DESTROY_BY_RCU + /* No available objects create a new slab. Since this is an + * expensive operation we do it without holding the semaphore + * and only briefly aquire it when we link in the fully + * allocated and constructed slab. */ -#ifdef CONFIG_SLUB - flags |= SLAB_STORE_USER; -#endif -#ifdef HAVE_KMEM_CACHE_CREATE_DTOR - cache = kmem_cache_create(cache_name, size, align, flags, - kmem_cache_generic_constructor, - kmem_cache_generic_destructor); -#else - cache = kmem_cache_create(cache_name, size, align, flags, NULL); -#endif - if (cache == NULL) - RETURN(NULL); + /* Under Solaris if the KM_SLEEP flag is passed we may never + * fail, so sleep as long as needed. Additionally, since we are + * using vmem_alloc() KM_NOSLEEP is not an option and we must + * fail. 
Shifting to allocating our own pages and mapping the + * virtual address space may allow us to bypass this issue. + */ + if (!flags) + flags |= KM_SLEEP; - /* Register shared shrinker function on initial cache create */ - down_read(&kmem_cache_sem); - if (list_empty(&kmem_cache_list)) { -#ifdef HAVE_SET_SHRINKER - kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS, - kmem_cache_generic_shrinker); - if (kmem_cache_shrinker == NULL) { - kmem_cache_destroy(cache); - up_read(&kmem_cache_sem); - RETURN(NULL); - } -#else - register_shrinker(&kmem_cache_shrinker); -#endif - } - up_read(&kmem_cache_sem); + if (flags & KM_SLEEP) + flags |= __GFP_NOFAIL; + else + GOTO(out, obj = NULL); - kcc = kmem_cache_add_cache_cb(cache, constructor, destructor, - reclaim, priv, vmp); - if (kcc == NULL) { - if (shrinker_flag) /* New shrinker registered must be removed */ -#ifdef HAVE_SET_SHRINKER - remove_shrinker(kmem_cache_shrinker); -#else - unregister_shrinker(&kmem_cache_shrinker); -#endif + sks = slab_alloc(skc, flags); + if (sks == NULL) + GOTO(out, obj = NULL); + + /* Run all the constructors now that the slab is fully allocated */ + list_for_each_entry(sko, &sks->sks_free_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); - kmem_cache_destroy(cache); - RETURN(NULL); - } + if (skc->skc_ctor) + skc->skc_ctor(sko->sko_addr, skc->skc_private, flags); + } - RETURN(cache); + /* Link the newly created slab in to the skc_partial_list, + * and retry the allocation which will now succeed. + */ + down_write(&skc->skc_sem); + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + GOTO(restart, obj = NULL); + +out_lock: + up_write(&skc->skc_sem); +out: + RETURN(obj); } -EXPORT_SYMBOL(__kmem_cache_create); +EXPORT_SYMBOL(spl_kmem_cache_alloc); -/* Return code provided despite Solaris's void return. There should be no - * harm here since the Solaris versions will ignore it anyway. */ -int -__kmem_cache_destroy(kmem_cache_t *cache) +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { - kmem_cache_cb_t *kcc; - char *name; - int rc; + struct hlist_head *head; + struct hlist_node *node; + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; ENTRY; - down_read(&kmem_cache_sem); - kcc = kmem_cache_find_cache_cb(cache); - if (kcc == NULL) { - up_read(&kmem_cache_sem); - RETURN(-EINVAL); - } - atomic_inc(&kcc->kcc_ref); - up_read(&kmem_cache_sem); + down_write(&skc->skc_sem); - name = (char *)kmem_cache_name(cache); + head = &skc->skc_hash[spl_hash_ptr(obj, skc->skc_hash_bits)]; + hlist_for_each_entry_rcu(sko, node, head, sko_hlist) { + if (sko->sko_addr == obj) { + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + break; + } + } -#ifdef HAVE_KMEM_CACHE_DESTROY_INT - rc = kmem_cache_destroy(cache); -#else - kmem_cache_destroy(cache); - rc = 0; -#endif + ASSERT(sko != NULL); /* Obj must be in hash */ + ASSERT(sks != NULL); /* Obj must reference slab */ + ASSERT(sks->sks_cache == skc); + hlist_del_init(&sko->sko_hlist); + list_add(&sko->sko_list, &sks->sks_free_list); - atomic_dec(&kcc->kcc_ref); - kmem_cache_remove_cache_cb(kcc); - kfree(name); + sks->sks_age = jiffies; + atomic_dec(&sks->sks_ref); + skc->skc_obj_alloc--; - /* Unregister generic shrinker on removal of all caches */ - down_read(&kmem_cache_sem); - if (list_empty(&kmem_cache_list)) -#ifdef HAVE_SET_SHRINKER - remove_shrinker(kmem_cache_shrinker); -#else - unregister_shrinker(&kmem_cache_shrinker); -#endif + /* Move slab to skc_partial_list when no longer full. 
Slabs + * are added to the kead to keep the partial list is quasi + * full sorted order. Fuller at the head, emptier at the tail. + */ + if (atomic_read(&sks->sks_ref) == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } - up_read(&kmem_cache_sem); - RETURN(rc); + /* Move emply slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (atomic_read(&sks->sks_ref) == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } + + __slab_reclaim(skc); + up_write(&skc->skc_sem); } -EXPORT_SYMBOL(__kmem_cache_destroy); - -/* Under Solaris if the KM_SLEEP flag is passed we absolutely must - * sleep until we are allocated the memory. Under Linux you can still - * get a memory allocation failure, so I'm forced to keep requesting - * the memory even if the system is under substantial memory pressure - * of fragmentation prevents the allocation from succeeded. This is - * not the correct fix, or even a good one. But it will do for now. - */ -void * -__kmem_cache_alloc(kmem_cache_t *cache, gfp_t flags) -{ - void *obj; - ENTRY; +EXPORT_SYMBOL(spl_kmem_cache_free); -restart: - obj = kmem_cache_alloc(cache, flags); - if ((obj == NULL) && (flags & KM_SLEEP)) { -#ifdef DEBUG_KMEM - atomic64_inc(&kmem_cache_alloc_failed); -#endif /* DEBUG_KMEM */ - GOTO(restart, obj); - } +static int +kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) +{ + spl_kmem_cache_t *skc; - /* When destructor support is removed we must be careful not to - * use the provided constructor which will end up being called - * more often than the destructor which we only call on free. Thus - * we call the proper constructor when there is no destructor. + /* Under linux a shrinker is not tightly coupled with a slab + * cache. In fact linux always systematically trys calling all + * registered shrinker callbacks until its target reclamation level + * is reached. Because of this we only register one shrinker + * function in the shim layer for all slab caches. And we always + * attempt to shrink all caches when this generic shrinker is called. */ -#ifndef HAVE_KMEM_CACHE_CREATE_DTOR -#ifdef HAVE_3ARG_KMEM_CACHE_CREATE_CTOR - kmem_cache_generic_constructor(obj, cache, flags); -#else - kmem_cache_generic_constructor(cache, obj); -#endif /* HAVE_KMEM_CACHE_CREATE_DTOR */ -#endif /* HAVE_3ARG_KMEM_CACHE_CREATE_CTOR */ + down_read(&spl_kmem_cache_sem); - RETURN(obj); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) + spl_kmem_cache_reap_now(skc); + + up_read(&spl_kmem_cache_sem); + + /* XXX: Under linux we should return the remaining number of + * entries in the cache. We should do this as well. 
+ */ + return 1; } -EXPORT_SYMBOL(__kmem_cache_alloc); void -__kmem_cache_free(kmem_cache_t *cache, void *obj) +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { -#ifndef HAVE_KMEM_CACHE_CREATE_DTOR - kmem_cache_generic_destructor(obj, cache, 0); -#endif - kmem_cache_free(cache, obj); + ENTRY; + ASSERT(skc && skc->skc_magic == SKC_MAGIC); + + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + + slab_reclaim(skc); + EXIT; } -EXPORT_SYMBOL(__kmem_cache_free); +EXPORT_SYMBOL(spl_kmem_cache_reap_now); void -__kmem_reap(void) +spl_kmem_reap(void) { - ENTRY; - /* Since there's no easy hook in to linux to force all the registered - * shrinkers to run we just run the ones registered for this shim */ kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL); - EXIT; } -EXPORT_SYMBOL(__kmem_reap); +EXPORT_SYMBOL(spl_kmem_reap); int -kmem_init(void) +spl_kmem_init(void) { - int i; + int rc = 0; ENTRY; - init_rwsem(&kmem_cache_sem); - INIT_LIST_HEAD(&kmem_cache_list); + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + + spl_slab_cache = NULL; + spl_obj_cache = NULL; + + spl_slab_cache = kmem_cache_create("spl_slab_cache", + sizeof(spl_kmem_slab_t), + 0, 0, NULL); + if (spl_slab_cache == NULL) + GOTO(out_cache, rc = -ENOMEM); - for (i = 0; i < KMEM_CACHE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&kmem_cache_table[i]); + spl_obj_cache = kmem_cache_create("spl_obj_cache", + sizeof(spl_kmem_obj_t), + 0, 0, NULL); + if (spl_obj_cache == NULL) + GOTO(out_cache, rc = -ENOMEM); + +#ifdef HAVE_SET_SHRINKER + spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS, shrinker); + if (spl_kmem_cache_shrinker == NULL) + GOTO(out_cache, rc = -ENOMEM); +#else + register_shrinker(&spl_kmem_cache_shrinker); +#endif #ifdef DEBUG_KMEM + { int i; atomic64_set(&kmem_alloc_used, 0); atomic64_set(&vmem_alloc_used, 0); + atomic64_set(&kmem_cache_alloc_failed, 0); spin_lock_init(&kmem_lock); INIT_LIST_HEAD(&kmem_list); @@ -558,10 +677,18 @@ kmem_init(void) for (i = 0; i < VMEM_TABLE_SIZE; i++) INIT_HLIST_HEAD(&vmem_table[i]); - - atomic64_set(&kmem_cache_alloc_failed, 0); + } #endif - RETURN(0); + RETURN(rc); + +out_cache: + if (spl_obj_cache) + (void)kmem_cache_destroy(spl_obj_cache); + + if (spl_slab_cache) + (void)kmem_cache_destroy(spl_slab_cache); + + RETURN(rc); } #ifdef DEBUG_KMEM @@ -609,53 +736,61 @@ sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) #endif /* DEBUG_KMEM */ void -kmem_fini(void) +spl_kmem_fini(void) { - ENTRY; #ifdef DEBUG_KMEM - { - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - /* Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. 
*/ - if (atomic64_read(&kmem_alloc_used) != 0) - CWARN("kmem leaked %ld/%ld bytes\n", - atomic_read(&kmem_alloc_used), kmem_alloc_max); - - spin_lock_irqsave(&kmem_lock, flags); - if (!list_empty(&kmem_list)) - CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", - "address", "size", "data", "func", "line"); - - list_for_each_entry(kd, &kmem_list, kd_list) - CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", - kd->kd_addr, kd->kd_size, - sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(&kmem_lock, flags); - - if (atomic64_read(&vmem_alloc_used) != 0) - CWARN("vmem leaked %ld/%ld bytes\n", - atomic_read(&vmem_alloc_used), vmem_alloc_max); - - spin_lock_irqsave(&vmem_lock, flags); - if (!list_empty(&vmem_list)) - CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", - "address", "size", "data", "func", "line"); - - list_for_each_entry(kd, &vmem_list, kd_list) - CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", - kd->kd_addr, kd->kd_size, - sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(&vmem_lock, flags); - } + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + /* Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. Performance is not + * a serious concern here since it is module unload time. */ + if (atomic64_read(&kmem_alloc_used) != 0) + CWARN("kmem leaked %ld/%ld bytes\n", + atomic_read(&kmem_alloc_used), kmem_alloc_max); + + spin_lock_irqsave(&kmem_lock, flags); + if (!list_empty(&kmem_list)) + CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", + "address", "size", "data", "func", "line"); + + list_for_each_entry(kd, &kmem_list, kd_list) + CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", + kd->kd_addr, kd->kd_size, + sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + + spin_unlock_irqrestore(&kmem_lock, flags); + + if (atomic64_read(&vmem_alloc_used) != 0) + CWARN("vmem leaked %ld/%ld bytes\n", + atomic_read(&vmem_alloc_used), vmem_alloc_max); + + spin_lock_irqsave(&vmem_lock, flags); + if (!list_empty(&vmem_list)) + CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", + "address", "size", "data", "func", "line"); + + list_for_each_entry(kd, &vmem_list, kd_list) + CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", + kd->kd_addr, kd->kd_size, + sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + + spin_unlock_irqrestore(&vmem_lock, flags); +#endif + ENTRY; + +#ifdef HAVE_SET_SHRINKER + remove_shrinker(spl_kmem_cache_shrinker); +#else + unregister_shrinker(&spl_kmem_cache_shrinker); #endif + + (void)kmem_cache_destroy(spl_obj_cache); + (void)kmem_cache_destroy(spl_slab_cache); + EXIT; } diff --git a/modules/spl/spl-vnode.c b/modules/spl/spl-vnode.c index 7ff35a8f6..f6dbc00c3 100644 --- a/modules/spl/spl-vnode.c +++ b/modules/spl/spl-vnode.c @@ -633,7 +633,7 @@ void vn_fini(void) { file_t *fp, *next_fp; - int rc, leaked = 0; + int leaked = 0; ENTRY; spin_lock(&vn_file_lock); @@ -644,19 +644,14 @@ vn_fini(void) leaked++; } - rc = kmem_cache_destroy(vn_file_cache); - if (rc) - CWARN("Warning leaked vn_file_cache objects, %d\n", rc); - + kmem_cache_destroy(vn_file_cache); vn_file_cache = NULL; spin_unlock(&vn_file_lock); if (leaked > 0) CWARN("Warning %d files leaked\n", leaked); - rc = kmem_cache_destroy(vn_cache); - if (rc) - CWARN("Warning leaked vn_cache objects, %d\n", rc); + kmem_cache_destroy(vn_cache); EXIT; return; diff --git a/modules/splat/splat-kmem.c b/modules/splat/splat-kmem.c index 277a9afe0..7342052c1 100644 --- 
a/modules/splat/splat-kmem.c +++ b/modules/splat/splat-kmem.c @@ -39,16 +39,24 @@ #define SPLAT_KMEM_TEST2_DESC "Memory allocation test (kmem_zalloc)" #define SPLAT_KMEM_TEST3_ID 0x0103 -#define SPLAT_KMEM_TEST3_NAME "slab_alloc" -#define SPLAT_KMEM_TEST3_DESC "Slab constructor/destructor test" +#define SPLAT_KMEM_TEST3_NAME "vmem_alloc" +#define SPLAT_KMEM_TEST3_DESC "Memory allocation test (vmem_alloc)" #define SPLAT_KMEM_TEST4_ID 0x0104 -#define SPLAT_KMEM_TEST4_NAME "slab_reap" -#define SPLAT_KMEM_TEST4_DESC "Slab reaping test" +#define SPLAT_KMEM_TEST4_NAME "vmem_zalloc" +#define SPLAT_KMEM_TEST4_DESC "Memory allocation test (vmem_zalloc)" #define SPLAT_KMEM_TEST5_ID 0x0105 -#define SPLAT_KMEM_TEST5_NAME "vmem_alloc" -#define SPLAT_KMEM_TEST5_DESC "Memory allocation test (vmem_alloc)" +#define SPLAT_KMEM_TEST5_NAME "kmem_cache1" +#define SPLAT_KMEM_TEST5_DESC "Slab ctor/dtor test (small)" + +#define SPLAT_KMEM_TEST6_ID 0x0106 +#define SPLAT_KMEM_TEST6_NAME "kmem_cache2" +#define SPLAT_KMEM_TEST6_DESC "Slab ctor/dtor test (large)" + +#define SPLAT_KMEM_TEST7_ID 0x0107 +#define SPLAT_KMEM_TEST7_NAME "kmem_reap" +#define SPLAT_KMEM_TEST7_DESC "Slab reaping test" #define SPLAT_KMEM_ALLOC_COUNT 10 #define SPLAT_VMEM_ALLOC_COUNT 10 @@ -142,16 +150,91 @@ splat_kmem_test2(struct file *file, void *arg) return rc; } +static int +splat_kmem_test3(struct file *file, void *arg) +{ + void *ptr[SPLAT_VMEM_ALLOC_COUNT]; + int size = PAGE_SIZE; + int i, count, rc = 0; + + while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + count = 0; + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + ptr[i] = vmem_alloc(size, KM_SLEEP); + if (ptr[i]) + count++; + } + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) + if (ptr[i]) + vmem_free(ptr[i], size); + + splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + "%d byte allocations, %d/%d successful\n", + size, count, SPLAT_VMEM_ALLOC_COUNT); + if (count != SPLAT_VMEM_ALLOC_COUNT) + rc = -ENOMEM; + + size *= 2; + } + + return rc; +} + +static int +splat_kmem_test4(struct file *file, void *arg) +{ + void *ptr[SPLAT_VMEM_ALLOC_COUNT]; + int size = PAGE_SIZE; + int i, j, count, rc = 0; + + while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + count = 0; + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + ptr[i] = vmem_zalloc(size, KM_SLEEP); + if (ptr[i]) + count++; + } + + /* Ensure buffer has been zero filled */ + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + for (j = 0; j < size; j++) { + if (((char *)ptr[i])[j] != '\0') { + splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + "%d-byte allocation was " + "not zeroed\n", size); + rc = -EFAULT; + } + } + } + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) + if (ptr[i]) + vmem_free(ptr[i], size); + + splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + "%d byte allocations, %d/%d successful\n", + size, count, SPLAT_VMEM_ALLOC_COUNT); + if (count != SPLAT_VMEM_ALLOC_COUNT) + rc = -ENOMEM; + + size *= 2; + } + + return rc; +} + #define SPLAT_KMEM_TEST_MAGIC 0x004488CCUL #define SPLAT_KMEM_CACHE_NAME "kmem_test" -#define SPLAT_KMEM_CACHE_SIZE 256 #define SPLAT_KMEM_OBJ_COUNT 128 -#define SPLAT_KMEM_OBJ_RECLAIM 64 +#define SPLAT_KMEM_OBJ_RECLAIM 16 typedef struct kmem_cache_data { - char kcd_buf[SPLAT_KMEM_CACHE_SIZE]; unsigned long kcd_magic; int kcd_flag; + char kcd_buf[0]; } kmem_cache_data_t; typedef struct kmem_cache_priv { @@ -159,48 +242,52 @@ typedef struct kmem_cache_priv { struct file *kcp_file; kmem_cache_t *kcp_cache; kmem_cache_data_t *kcp_kcd[SPLAT_KMEM_OBJ_COUNT]; + int kcp_size; int kcp_count; int kcp_rc; } kmem_cache_priv_t; static int 
-splat_kmem_test34_constructor(void *ptr, void *priv, int flags) +splat_kmem_cache_test_constructor(void *ptr, void *priv, int flags) { - kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; + kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; if (kcd) { - memset(kcd->kcd_buf, 0xaa, SPLAT_KMEM_CACHE_SIZE); - kcd->kcd_flag = 1; - if (kcp) { kcd->kcd_magic = kcp->kcp_magic; kcp->kcp_count++; } + + memset(kcd->kcd_buf, 0xaa, kcp->kcp_size - (sizeof *kcd)); + kcd->kcd_flag = 1; } return 0; } static void -splat_kmem_test34_destructor(void *ptr, void *priv) +splat_kmem_cache_test_destructor(void *ptr, void *priv) { - kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; + kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; if (kcd) { - memset(kcd->kcd_buf, 0xbb, SPLAT_KMEM_CACHE_SIZE); - kcd->kcd_flag = 0; - - if (kcp) + if (kcp) { + kcd->kcd_magic = 0; kcp->kcp_count--; + } + + memset(kcd->kcd_buf, 0xbb, kcp->kcp_size - (sizeof *kcd)); + kcd->kcd_flag = 0; } return; } static int -splat_kmem_test3(struct file *file, void *arg) +splat_kmem_cache_size_test(struct file *file, void *arg, + char *name, int size, int flags) { kmem_cache_t *cache = NULL; kmem_cache_data_t *kcd = NULL; @@ -209,22 +296,23 @@ splat_kmem_test3(struct file *file, void *arg) kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC; kcp.kcp_file = file; + kcp.kcp_size = size; kcp.kcp_count = 0; kcp.kcp_rc = 0; - cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, sizeof(*kcd), 0, - splat_kmem_test34_constructor, - splat_kmem_test34_destructor, - NULL, &kcp, NULL, 0); + cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + NULL, &kcp, NULL, flags); if (!cache) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); return -ENOMEM; } - kcd = kmem_cache_alloc(cache, 0); + kcd = kmem_cache_alloc(cache, KM_SLEEP); if (!kcd) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Unable to allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -232,7 +320,7 @@ splat_kmem_test3(struct file *file, void *arg) } if (!kcd->kcd_flag) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to run contructor for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -240,7 +328,7 @@ splat_kmem_test3(struct file *file, void *arg) } if (kcd->kcd_magic != kcp.kcp_magic) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to pass private data to constructor " "for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -248,23 +336,20 @@ splat_kmem_test3(struct file *file, void *arg) } max = kcp.kcp_count; - - /* Destructor's run lazily so it hard to check correctness here. 
- * We assume if it doesn't crash the free worked properly */ kmem_cache_free(cache, kcd); /* Destroy the entire cache which will force destructors to * run and we can verify one was called for every object */ kmem_cache_destroy(cache); if (kcp.kcp_count) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to run destructor on all slab objects " "for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; } - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, - "%d allocated/destroyed objects for '%s'\n", + splat_vprint(file, name, + "Successfully ran ctors/dtors for %d elements in '%s'\n", max, SPLAT_KMEM_CACHE_NAME); return rc; @@ -277,19 +362,38 @@ out_free: return rc; } +static int +splat_kmem_test5(struct file *file, void *arg) +{ + return splat_kmem_cache_size_test(file, arg, SPLAT_KMEM_TEST5_NAME, + sizeof(kmem_cache_data_t) * 1, 0); +} + +static int +splat_kmem_test6(struct file *file, void *arg) +{ + return splat_kmem_cache_size_test(file, arg, SPLAT_KMEM_TEST6_NAME, + sizeof(kmem_cache_data_t) * 1024, 0); +} + static void -splat_kmem_test4_reclaim(void *priv) +splat_kmem_cache_test_reclaim(void *priv) { kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; - int i; + int i, count; + + count = min(SPLAT_KMEM_OBJ_RECLAIM, kcp->kcp_count); + splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST7_NAME, + "Reaping %d objects from '%s'\n", count, + SPLAT_KMEM_CACHE_NAME); - splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST4_NAME, - "Reaping %d objects from '%s'\n", - SPLAT_KMEM_OBJ_RECLAIM, SPLAT_KMEM_CACHE_NAME); - for (i = 0; i < SPLAT_KMEM_OBJ_RECLAIM; i++) { + for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) { if (kcp->kcp_kcd[i]) { kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]); kcp->kcp_kcd[i] = NULL; + + if (--count == 0) + break; } } @@ -297,24 +401,25 @@ splat_kmem_test4_reclaim(void *priv) } static int -splat_kmem_test4(struct file *file, void *arg) +splat_kmem_test7(struct file *file, void *arg) { kmem_cache_t *cache; kmem_cache_priv_t kcp; - int i, rc = 0, max, reclaim_percent, target_percent; + int i, rc = 0; kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC; kcp.kcp_file = file; + kcp.kcp_size = 256; kcp.kcp_count = 0; kcp.kcp_rc = 0; - cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, - sizeof(kmem_cache_data_t), 0, - splat_kmem_test34_constructor, - splat_kmem_test34_destructor, - splat_kmem_test4_reclaim, &kcp, NULL, 0); + cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + splat_kmem_cache_test_reclaim, + &kcp, NULL, 0); if (!cache) { - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); return -ENOMEM; } @@ -323,36 +428,57 @@ splat_kmem_test4(struct file *file, void *arg) for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) { /* All allocations need not succeed */ - kcp.kcp_kcd[i] = kmem_cache_alloc(cache, 0); + kcp.kcp_kcd[i] = kmem_cache_alloc(cache, KM_SLEEP); if (!kcp.kcp_kcd[i]) { - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, "Unable to allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); } } - max = kcp.kcp_count; - ASSERT(max > 0); - - /* Force shrinker to run */ - kmem_reap(); - - /* Reclaim reclaimed objects, this ensure the destructors are run */ - kmem_cache_reap_now(cache); - - reclaim_percent = ((kcp.kcp_count * 100) / max); - target_percent = (((SPLAT_KMEM_OBJ_COUNT - SPLAT_KMEM_OBJ_RECLAIM) * 100) / - SPLAT_KMEM_OBJ_COUNT); - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, - "%d%% 
(%d/%d) of previous size, target of " - "%d%%-%d%% for '%s'\n", reclaim_percent, kcp.kcp_count, - max, target_percent - 10, target_percent + 10, - SPLAT_KMEM_CACHE_NAME); - if ((reclaim_percent < target_percent - 10) || - (reclaim_percent > target_percent + 10)) - rc = -EINVAL; + ASSERT(kcp.kcp_count > 0); + + /* Request the slab cache free any objects it can. For a few reasons + * this may not immediately result in more free memory even if objects + * are freed. First off, due to fragmentation we may not be able to + * reclaim any slabs. Secondly, even if we do we fully clear some + * slabs we will not want to immedately reclaim all of them because + * we may contend with cache allocs and thrash. What we want to see + * is slab size decrease more gradually as it becomes clear they + * will not be needed. This should be acheivable in less than minute + * if it takes longer than this something has gone wrong. + */ + for (i = 0; i < 60; i++) { + kmem_cache_reap_now(cache); + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "%s cache objects %d, slabs %u/%u objs %u/%u\n", + SPLAT_KMEM_CACHE_NAME, kcp.kcp_count, + (unsigned)cache->skc_slab_alloc, + (unsigned)cache->skc_slab_total, + (unsigned)cache->skc_obj_alloc, + (unsigned)cache->skc_obj_total); + + if (cache->skc_obj_total == 0) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + + if (cache->skc_obj_total == 0) { + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "Successfully created %d objects " + "in cache %s and reclaimed them\n", + SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME); + } else { + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "Failed to reclaim %u/%d objects from cache %s\n", + (unsigned)cache->skc_obj_total, SPLAT_KMEM_OBJ_COUNT, + SPLAT_KMEM_CACHE_NAME); + rc = -ENOMEM; + } - /* Cleanup our mess */ + /* Cleanup our mess (for failure case of time expiring) */ for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) if (kcp.kcp_kcd[i]) kmem_cache_free(cache, kcp.kcp_kcd[i]); @@ -362,38 +488,6 @@ splat_kmem_test4(struct file *file, void *arg) return rc; } -static int -splat_kmem_test5(struct file *file, void *arg) -{ - void *ptr[SPLAT_VMEM_ALLOC_COUNT]; - int size = PAGE_SIZE; - int i, count, rc = 0; - - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { - count = 0; - - for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { - ptr[i] = vmem_alloc(size, KM_SLEEP); - if (ptr[i]) - count++; - } - - for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) - if (ptr[i]) - vmem_free(ptr[i], size); - - splat_vprint(file, SPLAT_KMEM_TEST5_NAME, - "%d byte allocations, %d/%d successful\n", - size, count, SPLAT_VMEM_ALLOC_COUNT); - if (count != SPLAT_VMEM_ALLOC_COUNT) - rc = -ENOMEM; - - size *= 2; - } - - return rc; -} - splat_subsystem_t * splat_kmem_init(void) { @@ -421,6 +515,10 @@ splat_kmem_init(void) SPLAT_KMEM_TEST4_ID, splat_kmem_test4); SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC, SPLAT_KMEM_TEST5_ID, splat_kmem_test5); + SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC, + SPLAT_KMEM_TEST6_ID, splat_kmem_test6); + SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC, + SPLAT_KMEM_TEST7_ID, splat_kmem_test7); return sub; } @@ -429,6 +527,8 @@ void splat_kmem_fini(splat_subsystem_t *sub) { ASSERT(sub); + SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID); + SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID); |
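
The renamed splat tests 3 and 4 above exercise the `vmem_alloc()`/`vmem_zalloc()` interfaces the new slab is built on. A condensed sketch of the pattern they use follows; the function name is illustrative, the size range mirrors the tests' own constants, and the usual kernel definitions (`PAGE_SIZE`, error codes) are assumed to come in via `<sys/kmem.h>`.

```c
#include <sys/kmem.h>

/* Allocate progressively larger virtual-address-backed buffers and
 * verify vmem_zalloc() returns zero-filled memory, mirroring
 * splat_kmem_test3()/splat_kmem_test4() in modules/splat/splat-kmem.c.
 */
static int
vmem_sanity_check(void)
{
	size_t size, i;
	char *buf;

	for (size = PAGE_SIZE; size <= PAGE_SIZE * 1024; size *= 2) {
		buf = vmem_zalloc(size, KM_SLEEP);
		if (buf == NULL)
			return (-ENOMEM);

		for (i = 0; i < size; i++) {
			if (buf[i] != '\0') {
				vmem_free(buf, size);
				return (-EFAULT);
			}
		}

		vmem_free(buf, size);
	}

	return (0);
}
```
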