-rw-r--r--  ChangeLog                  |   7
-rw-r--r--  include/sys/kmem.h         | 119
-rw-r--r--  include/sys/types.h        |   1
-rw-r--r--  modules/spl/spl-generic.c  |   7
-rw-r--r--  modules/spl/spl-kmem.c     | 989
-rw-r--r--  modules/spl/spl-vnode.c    |  11
-rw-r--r--  modules/splat/splat-kmem.c | 310
7 files changed, 875 insertions, 569 deletions
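
For orientation before the patch itself, here is a minimal, illustrative sketch (not part of the commit) of how a caller uses the Solaris-style cache API this change introduces; the declarations it relies on appear in the include/sys/kmem.h hunk below. The my_obj type, its mutex member, and the example function are hypothetical, and the kmem_cache_* names are the compatibility macros that expand to the new spl_kmem_cache_* functions.

#include <sys/kmem.h>
#include <sys/mutex.h>

/* Hypothetical object type with an expensive-to-initialize member */
typedef struct my_obj {
	kmutex_t	mo_lock;
	int		mo_state;
} my_obj_t;

/* Signatures match spl_kmem_ctor_t / spl_kmem_dtor_t from the header below */
static int
my_obj_ctor(void *buf, void *priv, int kmflags)
{
	my_obj_t *mo = buf;

	mutex_init(&mo->mo_lock, NULL, MUTEX_DEFAULT, NULL);
	mo->mo_state = 0;
	return (0);
}

static void
my_obj_dtor(void *buf, void *priv)
{
	my_obj_t *mo = buf;

	mutex_destroy(&mo->mo_lock);
}

static void
my_obj_example(void)
{
	spl_kmem_cache_t *skc;
	my_obj_t *mo;

	/* align, reclaim, priv, vmp, and flags are unused in this sketch */
	skc = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
				my_obj_ctor, my_obj_dtor, NULL,
				NULL, NULL, 0);

	mo = kmem_cache_alloc(skc, KM_SLEEP);	/* object is already constructed */
	kmem_cache_free(skc, mo);		/* dtor runs later, when the slab
						 * itself is reclaimed */
	kmem_cache_destroy(skc);
}
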
@@ -1,3 +1,10 @@ +2008-06-13 Brian Behlendorf <[email protected]> + + * : modules/sys/kmem-slab.c : Re-implemented the slab to no + longer be based on the linux slab but to be it's own complete + implementation. The new slab behaves much more like the + Solaris slab than the Linux slab. + 2008-06-04 Brian Behlendorf <[email protected]> * : Tag spl-0.3.2 diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 0b1b53687..082db032a 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -308,11 +308,11 @@ kmem_alloc_tryhard(size_t size, size_t *alloc_size, int kmflags) /* * Slab allocation interfaces */ -#undef KMC_NOTOUCH /* No linux analog */ +#undef KMC_NOTOUCH /* XXX: Unsupported */ #define KMC_NODEBUG 0x00000000 /* Default behavior */ -#define KMC_NOMAGAZINE /* No linux analog */ -#define KMC_NOHASH /* No linux analog */ -#define KMC_QCACHE /* No linux analog */ +#define KMC_NOMAGAZINE /* XXX: Unsupported */ +#define KMC_NOHASH /* XXX: Unsupported */ +#define KMC_QCACHE /* XXX: Unsupported */ #define KMC_REAP_CHUNK 256 #define KMC_DEFAULT_SEEKS DEFAULT_SEEKS @@ -342,7 +342,7 @@ static __inline__ size_t kmem_maxavail(void) { #error "kmem_maxavail() not implemented" } -static __inline__ uint64_t kmem_cache_stat(kmem_cache_t *cache) { +static __inline__ uint64_t kmem_cache_stat(spl_kmem_cache_t *cache) { #error "kmem_cache_stat() not implemented" } #endif /* DEBUG_KMEM_UNIMPLEMENTED */ @@ -357,34 +357,101 @@ kmem_debugging(void) return 0; } -typedef int (*kmem_constructor_t)(void *, void *, int); -typedef void (*kmem_destructor_t)(void *, void *); -typedef void (*kmem_reclaim_t)(void *); - extern int kmem_set_warning(int flag); -extern kmem_cache_t * -__kmem_cache_create(char *name, size_t size, size_t align, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, + +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c + +#define SPL_KMEM_CACHE_HASH_BITS 12 /* 4k, sized for 1000's of objs */ +#define SPL_KMEM_CACHE_HASH_ELTS (1 << SPL_KMEM_CACHE_HASH_BITS) +#define SPL_KMEM_CACHE_HASH_SIZE (sizeof(struct hlist_head) * \ + SPL_KMEM_CACHE_HASH_ELTS) + +#define SPL_KMEM_CACHE_DELAY 5 +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 + +typedef int (*spl_kmem_ctor_t)(void *, void *, int); +typedef void (*spl_kmem_dtor_t)(void *, void *); +typedef void (*spl_kmem_reclaim_t)(void *); + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + uint32_t sko_flags; /* Per object flags */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ + struct hlist_node sko_hlist; /* Used object hash linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + atomic_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + uint32_t skc_flags; /* Flags */ + 
uint32_t skc_obj_size; /* Object size */ + uint32_t skc_chunk_size; /* sizeof(*obj) + alignment */ + uint32_t skc_slab_size; /* slab size */ + uint32_t skc_max_chunks; /* max chunks per slab */ + uint32_t skc_delay; /* slab reclaim interval */ + uint32_t skc_hash_bits; /* Hash table bits */ + uint32_t skc_hash_size; /* Hash table size */ + uint32_t skc_hash_elts; /* Hash table elements */ + struct hlist_head *skc_hash; /* Hash table address */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list;/* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rw_semaphore skc_sem; /* Cache semaphore */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create;/* Slab creates */ + uint64_t skc_slab_destroy;/* Slab destroys */ + uint64_t skc_slab_total; /* Slab total */ + uint64_t skc_slab_alloc; /* Slab alloc */ + uint64_t skc_slab_max; /* Slab max */ + uint64_t skc_obj_total; /* Obj total */ + uint64_t skc_obj_alloc; /* Obj alloc */ + uint64_t skc_obj_max; /* Obj max */ + uint64_t skc_hash_depth; /* Hash depth */ + uint64_t skc_hash_max; /* Hash depth max */ +} spl_kmem_cache_t; + +extern spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); -extern int __kmem_cache_destroy(kmem_cache_t *cache); -extern void *__kmem_cache_alloc(kmem_cache_t *cache, gfp_t flags); -extern void __kmem_cache_free(kmem_cache_t *cache, void *obj); -extern void __kmem_reap(void); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); +extern void spl_kmem_reap(void); -int kmem_init(void); -void kmem_fini(void); +int spl_kmem_init(void); +void spl_kmem_fini(void); #define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ - __kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) -#define kmem_cache_destroy(cache) __kmem_cache_destroy(cache) -#define kmem_cache_alloc(cache, flags) __kmem_cache_alloc(cache, flags) -#define kmem_cache_free(cache, obj) __kmem_cache_free(cache, obj) -#define kmem_cache_reap_now(cache) kmem_cache_shrink(cache) -#define kmem_reap() __kmem_reap() + spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) +#define kmem_reap() spl_kmem_reap() #ifdef __cplusplus } diff --git a/include/sys/types.h b/include/sys/types.h index c2ad9837d..c60bfb208 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -16,6 +16,7 @@ typedef unsigned long uintptr_t; #ifndef HAVE_KMEM_CACHE_T typedef struct kmem_cache kmem_cache_t; #endif +#define kmem_cache_t spl_kmem_cache_t typedef enum { B_FALSE=0, B_TRUE=1 } boolean_t; typedef unsigned long intptr_t; diff --git a/modules/spl/spl-generic.c b/modules/spl/spl-generic.c index 3027c7108..7a073ee52 100644 --- a/modules/spl/spl-generic.c +++ b/modules/spl/spl-generic.c @@ -130,7 +130,7 @@ static int __init spl_init(void) if ((rc = debug_init())) return rc; - if ((rc = kmem_init())) + if ((rc = spl_kmem_init())) GOTO(out , rc); if ((rc = 
spl_mutex_init())) @@ -159,7 +159,7 @@ out4: out3: spl_mutex_fini(); out2: - kmem_fini(); + spl_kmem_fini(); out: debug_fini(); @@ -176,7 +176,8 @@ static void spl_fini(void) kstat_fini(); proc_fini(); vn_fini(); - kmem_fini(); + spl_mutex_fini(); + spl_kmem_fini(); debug_fini(); } diff --git a/modules/spl/spl-kmem.c b/modules/spl/spl-kmem.c index d7643067a..e52f19935 100644 --- a/modules/spl/spl-kmem.c +++ b/modules/spl/spl-kmem.c @@ -33,7 +33,13 @@ #define DEBUG_SUBSYSTEM S_KMEM /* - * Memory allocation interfaces + * Memory allocation interfaces and debugging for basic kmem_* + * and vmem_* style memory allocation. When DEBUG_KMEM is enable + * all allocations will be tracked when they are allocated and + * freed. When the SPL module is unload a list of all leaked + * addresses and where they were allocated will be dumped to the + * console. Enabling this feature has a significant impant on + * performance but it makes finding memory leaks staight forward. */ #ifdef DEBUG_KMEM /* Shim layer memory accounting */ @@ -75,477 +81,590 @@ EXPORT_SYMBOL(kmem_set_warning); /* * Slab allocation interfaces * - * While the linux slab implementation was inspired by solaris they - * have made some changes to the API which complicates this shim - * layer. For one thing the same symbol names are used with different - * arguments for the prototypes. To deal with this we must use the - * preprocessor to re-order arguments. Happily for us standard C says, - * "Macro's appearing in their own expansion are not reexpanded" so - * this does not result in an infinite recursion. Additionally the - * function pointers registered by solarias differ from those used - * by linux so a lookup and mapping from linux style callback to a - * solaris style callback is needed. There is some overhead in this - * operation which isn't horibile but it needs to be kept in mind. + * While the Linux slab implementation was inspired by the Solaris + * implemenation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleaner for these data types unlike + * may Linux data type which do need to be explicitly destroyed. + * + * 2) Virtual address backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contigeous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + * + * XXX: Refactor the below code in to smaller functions. This works + * for a first pass but each function is doing to much. 
+ * + * XXX: Implement SPL proc interface to export full per cache stats. + * + * XXX: Implement work requests to keep an eye on each cache and + * shrink them via slab_reclaim() when they are wasting lots + * of space. Currently this process is driven by the reapers. + * + * XXX: Implement proper small cache object support by embedding + * the spl_kmem_slab_t, spl_kmem_obj_t's, and objects in the + * allocated for a particular slab. + * + * XXX: Implement a resizable used object hash. Currently the hash + * is statically sized for thousands of objects but it should + * grow based on observed worst case slab depth. + * + * XXX: Improve the partial slab list by carefully maintaining a + * strict ordering of fullest to emptiest slabs based on + * the slab reference count. This gaurentees the when freeing + * slabs back to the system we need only linearly traverse the + * last N slabs in the list to discover all the freeable slabs. + * + * XXX: NUMA awareness for optionally allocating memory close to a + * particular core. This can be adventageous if you know the slab + * object will be short lived and primarily accessed from one core. + * + * XXX: Slab coloring may also yield performance improvements and would + * be desirable to implement. */ -#define KCC_MAGIC 0x7a7a7a7a -#define KCC_POISON 0x77 - -typedef struct kmem_cache_cb { - int kcc_magic; - struct hlist_node kcc_hlist; - struct list_head kcc_list; - kmem_cache_t * kcc_cache; - kmem_constructor_t kcc_constructor; - kmem_destructor_t kcc_destructor; - kmem_reclaim_t kcc_reclaim; - void * kcc_private; - void * kcc_vmp; - atomic_t kcc_ref; -} kmem_cache_cb_t; - -#define KMEM_CACHE_HASH_BITS 10 -#define KMEM_CACHE_TABLE_SIZE (1 << KMEM_CACHE_HASH_BITS) - -struct hlist_head kmem_cache_table[KMEM_CACHE_TABLE_SIZE]; -struct list_head kmem_cache_list; -static struct rw_semaphore kmem_cache_sem; + +/* Ensure the __kmem_cache_create/__kmem_cache_destroy macros are + * removed here to prevent a recursive substitution, we want to call + * the native linux version. + */ +#undef kmem_cache_t +#undef kmem_cache_create +#undef kmem_cache_destroy +#undef kmem_cache_alloc +#undef kmem_cache_free + +static struct list_head spl_kmem_cache_list; /* List of caches */ +static struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +static kmem_cache_t *spl_slab_cache; /* Cache for slab structs */ +static kmem_cache_t *spl_obj_cache; /* Cache for obj structs */ #ifdef HAVE_SET_SHRINKER -static struct shrinker *kmem_cache_shrinker; +static struct shrinker *spl_kmem_cache_shrinker; #else static int kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask); -static struct shrinker kmem_cache_shrinker = { +static struct shrinker spl_kmem_cache_shrinker = { .shrink = kmem_cache_generic_shrinker, .seeks = KMC_DEFAULT_SEEKS, }; #endif -/* Function must be called while holding the kmem_cache_sem - * Because kmem_cache_t is an opaque datatype we're forced to - * match pointers to identify specific cache entires. 
- */ -static kmem_cache_cb_t * -kmem_cache_find_cache_cb(kmem_cache_t *cache) -{ - struct hlist_head *head; - struct hlist_node *node; - kmem_cache_cb_t *kcc; -#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK - ASSERT(rwsem_is_locked(&kmem_cache_sem)); -#endif - - head = &kmem_cache_table[hash_ptr(cache, KMEM_CACHE_HASH_BITS)]; - hlist_for_each_entry_rcu(kcc, node, head, kcc_hlist) - if (kcc->kcc_cache == cache) - return kcc; +static spl_kmem_slab_t * +slab_alloc(spl_kmem_cache_t *skc, int flags) { + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + int i; + ENTRY; - return NULL; -} + sks = kmem_cache_alloc(spl_slab_cache, flags); + if (sks == NULL) + RETURN(sks); + + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + atomic_set(&sks->sks_ref, 0); + + for (i = 0; i < sks->sks_objs; i++) { + sko = kmem_cache_alloc(spl_obj_cache, flags); + if (sko == NULL) { +out_alloc: + /* Unable to fully construct slab, objects, + * and object data buffers unwind everything. + */ + list_for_each_entry_safe(sko, n, &sks->sks_free_list, + sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + vmem_free(sko->sko_addr, skc->skc_obj_size); + list_del(&sko->sko_list); + kmem_cache_free(spl_obj_cache, sko); + } + + kmem_cache_free(spl_slab_cache, sks); + GOTO(out, sks = NULL); + } -static kmem_cache_cb_t * -kmem_cache_add_cache_cb(kmem_cache_t *cache, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, - void *priv, void *vmp) -{ - kmem_cache_cb_t *kcc; - - kcc = (kmem_cache_cb_t *)kmalloc(sizeof(*kcc), GFP_KERNEL); - if (kcc) { - kcc->kcc_magic = KCC_MAGIC; - kcc->kcc_cache = cache; - kcc->kcc_constructor = constructor; - kcc->kcc_destructor = destructor; - kcc->kcc_reclaim = reclaim; - kcc->kcc_private = priv; - kcc->kcc_vmp = vmp; - atomic_set(&kcc->kcc_ref, 0); - down_write(&kmem_cache_sem); - hlist_add_head_rcu(&kcc->kcc_hlist, &kmem_cache_table[ - hash_ptr(cache, KMEM_CACHE_HASH_BITS)]); - list_add_tail(&kcc->kcc_list, &kmem_cache_list); - up_write(&kmem_cache_sem); - } - - return kcc; -} + sko->sko_addr = vmem_alloc(skc->skc_obj_size, flags); + if (sko->sko_addr == NULL) { + kmem_cache_free(spl_obj_cache, sko); + GOTO(out_alloc, sks = NULL); + } -static void -kmem_cache_remove_cache_cb(kmem_cache_cb_t *kcc) -{ - down_write(&kmem_cache_sem); - ASSERT(atomic_read(&kcc->kcc_ref) == 0); - hlist_del_init(&kcc->kcc_hlist); - list_del_init(&kcc->kcc_list); - up_write(&kmem_cache_sem); - - if (kcc) { - memset(kcc, KCC_POISON, sizeof(*kcc)); - kfree(kcc); + sko->sko_magic = SKO_MAGIC; + sko->sko_flags = 0; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + INIT_HLIST_NODE(&sko->sko_hlist); + list_add(&sko->sko_list, &sks->sks_free_list); } +out: + RETURN(sks); } -#ifdef HAVE_3ARG_KMEM_CACHE_CREATE_CTOR +/* Removes slab from complete or partial list, so it must + * be called with the 'skc->skc_sem' semaphore held. + * */ static void -kmem_cache_generic_constructor(void *ptr, kmem_cache_t *cache, - unsigned long flags) -{ - kmem_cache_cb_t *kcc; - kmem_constructor_t constructor; - void *private; - - /* Ensure constructor verifies are not passed to the registered - * constructors. 
This may not be safe due to the Solaris constructor - * not being aware of how to handle the SLAB_CTOR_VERIFY flag - */ - ASSERT(flags & SLAB_CTOR_CONSTRUCTOR); +slab_free(spl_kmem_slab_t *sks) { + spl_kmem_cache_t *skc; + spl_kmem_obj_t *sko, *n; + int i = 0; + ENTRY; - if (flags & SLAB_CTOR_VERIFY) - return; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(atomic_read(&sks->sks_ref) == 0); + skc = sks->sks_cache; + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; - if (flags & SLAB_CTOR_ATOMIC) - flags = KM_NOSLEEP; - else - flags = KM_SLEEP; -#else -static void -kmem_cache_generic_constructor(kmem_cache_t *cache, void *ptr) -{ - kmem_cache_cb_t *kcc; - kmem_constructor_t constructor; - void *private; - int flags = KM_NOSLEEP; +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + ASSERT(rwsem_is_locked(&skc->skc_sem)); #endif - /* We can be called with interrupts disabled so it is critical that - * this function and the registered constructor never sleep. - */ - while (!down_read_trylock(&kmem_cache_sem)); - /* Callback list must be in sync with linux slab caches */ - kcc = kmem_cache_find_cache_cb(cache); - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - atomic_inc(&kcc->kcc_ref); + list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); - constructor = kcc->kcc_constructor; - private = kcc->kcc_private; + /* Run destructors for being freed */ + if (skc->skc_dtor) + skc->skc_dtor(sko->sko_addr, skc->skc_private); - up_read(&kmem_cache_sem); + vmem_free(sko->sko_addr, skc->skc_obj_size); + list_del(&sko->sko_list); + kmem_cache_free(spl_obj_cache, sko); + i++; + } - if (constructor) - constructor(ptr, private, (int)flags); + ASSERT(sks->sks_objs == i); + list_del(&sks->sks_list); + kmem_cache_free(spl_slab_cache, sks); - atomic_dec(&kcc->kcc_ref); + EXIT; +} - /* Linux constructor has no return code, silently eat it */ +static int +__slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + int rc = 0; + ENTRY; + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + ASSERT(rwsem_is_locked(&skc->skc_sem)); +#endif + /* + * Free empty slabs which have not been touched in skc_delay + * seconds. This delay time is important to avoid thrashing. + * Empty slabs will be at the end of the skc_partial_list. + */ + list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, + sks_list) { + if (atomic_read(&sks->sks_ref) > 0) + break; + + if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) { + slab_free(sks); + rc++; + } + } + + /* Returns number of slabs reclaimed */ + RETURN(rc); } -static void -kmem_cache_generic_destructor(void *ptr, kmem_cache_t *cache, unsigned long flags) +static int +slab_reclaim(spl_kmem_cache_t *skc) { - kmem_cache_cb_t *kcc; - kmem_destructor_t destructor; - void *private; + int rc; + ENTRY; - /* No valid destructor flags */ - ASSERT(flags == 0); + down_write(&skc->skc_sem); + rc = __slab_reclaim(skc); + up_write(&skc->skc_sem); - /* We can be called with interrupts disabled so it is critical that - * this function and the registered constructor never sleep. 
- */ - while (!down_read_trylock(&kmem_cache_sem)); + RETURN(rc); +} - /* Callback list must be in sync with linux slab caches */ - kcc = kmem_cache_find_cache_cb(cache); - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - atomic_inc(&kcc->kcc_ref); +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, + spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + spl_kmem_cache_t *skc; + int i, kmem_flags = KM_SLEEP; + ENTRY; - destructor = kcc->kcc_destructor; - private = kcc->kcc_private; + /* We may be called when there is a non-zero preempt_count or + * interrupts are disabled is which case we must not sleep. + */ + if (current_thread_info()->preempt_count || irqs_disabled()) + kmem_flags = KM_NOSLEEP; - up_read(&kmem_cache_sem); + /* Allocate new cache memory and initialize. */ + skc = (spl_kmem_cache_t *)kmem_alloc(sizeof(*skc), kmem_flags); + if (skc == NULL) + RETURN(NULL); - /* Solaris destructor takes no flags, silently eat them */ - if (destructor) - destructor(ptr, private); + skc->skc_magic = SKC_MAGIC; - atomic_dec(&kcc->kcc_ref); + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags); + if (skc->skc_name == NULL) { + kmem_free(skc, sizeof(*skc)); + RETURN(NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_chunk_size = 0; /* XXX: Needed only when implementing */ + skc->skc_slab_size = 0; /* small slab object optimizations */ + skc->skc_max_chunks = 0; /* which are yet supported. */ + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + + skc->skc_hash_bits = SPL_KMEM_CACHE_HASH_BITS; + skc->skc_hash_size = SPL_KMEM_CACHE_HASH_SIZE; + skc->skc_hash_elts = SPL_KMEM_CACHE_HASH_ELTS; + skc->skc_hash = (struct hlist_head *) + kmem_alloc(skc->skc_hash_size, kmem_flags); + if (skc->skc_hash == NULL) { + kmem_free(skc->skc_name, skc->skc_name_size); + kmem_free(skc, sizeof(*skc)); + } + + for (i = 0; i < skc->skc_hash_elts; i++) + INIT_HLIST_HEAD(&skc->skc_hash[i]); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + init_rwsem(&skc->skc_sem); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_hash_depth = 0; + skc->skc_hash_max = 0; + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + RETURN(skc); } +EXPORT_SYMBOL(spl_kmem_cache_create); -/* Arguments are ignored */ -static int -kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) +/* The caller must ensure there are no racing calls to + * spl_kmem_cache_alloc() for this spl_kmem_cache_t when + * it is being destroyed. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { - kmem_cache_cb_t *kcc; - int total = 0; + spl_kmem_slab_t *sks, *m; + ENTRY; - /* Under linux a shrinker is not tightly coupled with a slab - * cache. In fact linux always systematically trys calling all - * registered shrinker callbacks until its target reclamation level - * is reached. 
Because of this we only register one shrinker - * function in the shim layer for all slab caches. And we always - * attempt to shrink all caches when this generic shrinker is called. - */ - down_read(&kmem_cache_sem); - - list_for_each_entry(kcc, &kmem_cache_list, kcc_list) { - ASSERT(kcc); - ASSERT(kcc->kcc_magic == KCC_MAGIC); - - /* Take a reference on the cache in question. If that - * cache is contended simply skip it, it may already be - * in the process of a reclaim or the ctor/dtor may be - * running in either case it's best to skip it. - */ - atomic_inc(&kcc->kcc_ref); - if (atomic_read(&kcc->kcc_ref) > 1) { - atomic_dec(&kcc->kcc_ref); - continue; - } + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + down_write(&skc->skc_sem); - /* Under linux the desired number and gfp type of objects - * is passed to the reclaiming function as a sugested reclaim - * target. I do not pass these args on because reclaim - * policy is entirely up to the owner under solaris. We only - * pass on the pre-registered private data. - */ - if (kcc->kcc_reclaim) - kcc->kcc_reclaim(kcc->kcc_private); - - atomic_dec(&kcc->kcc_ref); - total += 1; - } - - /* Under linux we should return the remaining number of entires in - * the cache. Unfortunately, I don't see an easy way to safely - * emulate this behavior so I'm returning one entry per cache which - * was registered with the generic shrinker. This should fake out - * the linux VM when it attempts to shrink caches. + /* Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ - up_read(&kmem_cache_sem); + ASSERT(list_empty(&skc->skc_complete_list)); - return total; + list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list) + slab_free(sks); + + kmem_free(skc->skc_hash, skc->skc_hash_size); + kmem_free(skc->skc_name, skc->skc_name_size); + kmem_free(skc, sizeof(*skc)); + up_write(&skc->skc_sem); + + EXIT; } +EXPORT_SYMBOL(spl_kmem_cache_destroy); -/* Ensure the __kmem_cache_create/__kmem_cache_destroy macros are - * removed here to prevent a recursive substitution, we want to call - * the native linux version. +/* The kernel provided hash_ptr() function behaves exceptionally badly + * when all the addresses are page aligned which is likely the case + * here. To avoid this issue shift off the low order non-random bits. 
*/ -#undef kmem_cache_create -#undef kmem_cache_destroy -#undef kmem_cache_alloc -#undef kmem_cache_free +static unsigned long +spl_hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr >> PAGE_SHIFT, bits); +} -kmem_cache_t * -__kmem_cache_create(char *name, size_t size, size_t align, - kmem_constructor_t constructor, - kmem_destructor_t destructor, - kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { - kmem_cache_t *cache; - kmem_cache_cb_t *kcc; - int shrinker_flag = 0; - char *cache_name; + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko; + void *obj; + unsigned long key; ENTRY; - /* XXX: - Option currently unsupported by shim layer */ - ASSERT(!vmp); - ASSERT(flags == 0); + down_write(&skc->skc_sem); +restart: + /* Check for available objects from the partial slabs */ + if (!list_empty(&skc->skc_partial_list)) { + sks = list_first_entry(&skc->skc_partial_list, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(atomic_read(&sks->sks_ref) < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + sko = list_first_entry(&sks->sks_free_list, + spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list, add to used hash */ + list_del_init(&sko->sko_list); + key = spl_hash_ptr(sko->sko_addr, skc->skc_hash_bits); + hlist_add_head_rcu(&sko->sko_hlist, &skc->skc_hash[key]); + + sks->sks_age = jiffies; + atomic_inc(&sks->sks_ref); + skc->skc_obj_alloc++; + + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + if (atomic_read(&sks->sks_ref) == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } - cache_name = kzalloc(strlen(name) + 1, GFP_KERNEL); - if (cache_name == NULL) - RETURN(NULL); + /* Move slab to skc_complete_list when full */ + if (atomic_read(&sks->sks_ref) == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + + GOTO(out_lock, obj = sko->sko_addr); + } + + up_write(&skc->skc_sem); - strcpy(cache_name, name); - - /* When your slab is implemented in terms of the slub it - * is possible similarly sized slab caches will be merged. - * For our implementation we must make sure this never - * happens because we require a unique cache address to - * use as a hash key when looking up the constructor, - * destructor, and shrinker registered for each unique - * type of slab cache. Passing any of the following flags - * will prevent the slub merging. - * - * SLAB_RED_ZONE - * SLAB_POISON - * SLAB_STORE_USER - * SLAB_TRACE - * SLAB_DESTROY_BY_RCU + /* No available objects create a new slab. Since this is an + * expensive operation we do it without holding the semaphore + * and only briefly aquire it when we link in the fully + * allocated and constructed slab. */ -#ifdef CONFIG_SLUB - flags |= SLAB_STORE_USER; -#endif -#ifdef HAVE_KMEM_CACHE_CREATE_DTOR - cache = kmem_cache_create(cache_name, size, align, flags, - kmem_cache_generic_constructor, - kmem_cache_generic_destructor); -#else - cache = kmem_cache_create(cache_name, size, align, flags, NULL); -#endif - if (cache == NULL) - RETURN(NULL); + /* Under Solaris if the KM_SLEEP flag is passed we may never + * fail, so sleep as long as needed. Additionally, since we are + * using vmem_alloc() KM_NOSLEEP is not an option and we must + * fail. 
Shifting to allocating our own pages and mapping the + * virtual address space may allow us to bypass this issue. + */ + if (!flags) + flags |= KM_SLEEP; - /* Register shared shrinker function on initial cache create */ - down_read(&kmem_cache_sem); - if (list_empty(&kmem_cache_list)) { -#ifdef HAVE_SET_SHRINKER - kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS, - kmem_cache_generic_shrinker); - if (kmem_cache_shrinker == NULL) { - kmem_cache_destroy(cache); - up_read(&kmem_cache_sem); - RETURN(NULL); - } -#else - register_shrinker(&kmem_cache_shrinker); -#endif - } - up_read(&kmem_cache_sem); + if (flags & KM_SLEEP) + flags |= __GFP_NOFAIL; + else + GOTO(out, obj = NULL); - kcc = kmem_cache_add_cache_cb(cache, constructor, destructor, - reclaim, priv, vmp); - if (kcc == NULL) { - if (shrinker_flag) /* New shrinker registered must be removed */ -#ifdef HAVE_SET_SHRINKER - remove_shrinker(kmem_cache_shrinker); -#else - unregister_shrinker(&kmem_cache_shrinker); -#endif + sks = slab_alloc(skc, flags); + if (sks == NULL) + GOTO(out, obj = NULL); + + /* Run all the constructors now that the slab is fully allocated */ + list_for_each_entry(sko, &sks->sks_free_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); - kmem_cache_destroy(cache); - RETURN(NULL); - } + if (skc->skc_ctor) + skc->skc_ctor(sko->sko_addr, skc->skc_private, flags); + } - RETURN(cache); + /* Link the newly created slab in to the skc_partial_list, + * and retry the allocation which will now succeed. + */ + down_write(&skc->skc_sem); + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + GOTO(restart, obj = NULL); + +out_lock: + up_write(&skc->skc_sem); +out: + RETURN(obj); } -EXPORT_SYMBOL(__kmem_cache_create); +EXPORT_SYMBOL(spl_kmem_cache_alloc); -/* Return code provided despite Solaris's void return. There should be no - * harm here since the Solaris versions will ignore it anyway. */ -int -__kmem_cache_destroy(kmem_cache_t *cache) +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { - kmem_cache_cb_t *kcc; - char *name; - int rc; + struct hlist_head *head; + struct hlist_node *node; + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; ENTRY; - down_read(&kmem_cache_sem); - kcc = kmem_cache_find_cache_cb(cache); - if (kcc == NULL) { - up_read(&kmem_cache_sem); - RETURN(-EINVAL); - } - atomic_inc(&kcc->kcc_ref); - up_read(&kmem_cache_sem); + down_write(&skc->skc_sem); - name = (char *)kmem_cache_name(cache); + head = &skc->skc_hash[spl_hash_ptr(obj, skc->skc_hash_bits)]; + hlist_for_each_entry_rcu(sko, node, head, sko_hlist) { + if (sko->sko_addr == obj) { + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + break; + } + } -#ifdef HAVE_KMEM_CACHE_DESTROY_INT - rc = kmem_cache_destroy(cache); -#else - kmem_cache_destroy(cache); - rc = 0; -#endif + ASSERT(sko != NULL); /* Obj must be in hash */ + ASSERT(sks != NULL); /* Obj must reference slab */ + ASSERT(sks->sks_cache == skc); + hlist_del_init(&sko->sko_hlist); + list_add(&sko->sko_list, &sks->sks_free_list); - atomic_dec(&kcc->kcc_ref); - kmem_cache_remove_cache_cb(kcc); - kfree(name); + sks->sks_age = jiffies; + atomic_dec(&sks->sks_ref); + skc->skc_obj_alloc--; - /* Unregister generic shrinker on removal of all caches */ - down_read(&kmem_cache_sem); - if (list_empty(&kmem_cache_list)) -#ifdef HAVE_SET_SHRINKER - remove_shrinker(kmem_cache_shrinker); -#else - unregister_shrinker(&kmem_cache_shrinker); -#endif + /* Move slab to skc_partial_list when no longer full. 
Slabs + * are added to the kead to keep the partial list is quasi + * full sorted order. Fuller at the head, emptier at the tail. + */ + if (atomic_read(&sks->sks_ref) == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } - up_read(&kmem_cache_sem); - RETURN(rc); + /* Move emply slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (atomic_read(&sks->sks_ref) == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } + + __slab_reclaim(skc); + up_write(&skc->skc_sem); } -EXPORT_SYMBOL(__kmem_cache_destroy); - -/* Under Solaris if the KM_SLEEP flag is passed we absolutely must - * sleep until we are allocated the memory. Under Linux you can still - * get a memory allocation failure, so I'm forced to keep requesting - * the memory even if the system is under substantial memory pressure - * of fragmentation prevents the allocation from succeeded. This is - * not the correct fix, or even a good one. But it will do for now. - */ -void * -__kmem_cache_alloc(kmem_cache_t *cache, gfp_t flags) -{ - void *obj; - ENTRY; +EXPORT_SYMBOL(spl_kmem_cache_free); -restart: - obj = kmem_cache_alloc(cache, flags); - if ((obj == NULL) && (flags & KM_SLEEP)) { -#ifdef DEBUG_KMEM - atomic64_inc(&kmem_cache_alloc_failed); -#endif /* DEBUG_KMEM */ - GOTO(restart, obj); - } +static int +kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) +{ + spl_kmem_cache_t *skc; - /* When destructor support is removed we must be careful not to - * use the provided constructor which will end up being called - * more often than the destructor which we only call on free. Thus - * we call the proper constructor when there is no destructor. + /* Under linux a shrinker is not tightly coupled with a slab + * cache. In fact linux always systematically trys calling all + * registered shrinker callbacks until its target reclamation level + * is reached. Because of this we only register one shrinker + * function in the shim layer for all slab caches. And we always + * attempt to shrink all caches when this generic shrinker is called. */ -#ifndef HAVE_KMEM_CACHE_CREATE_DTOR -#ifdef HAVE_3ARG_KMEM_CACHE_CREATE_CTOR - kmem_cache_generic_constructor(obj, cache, flags); -#else - kmem_cache_generic_constructor(cache, obj); -#endif /* HAVE_KMEM_CACHE_CREATE_DTOR */ -#endif /* HAVE_3ARG_KMEM_CACHE_CREATE_CTOR */ + down_read(&spl_kmem_cache_sem); - RETURN(obj); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) + spl_kmem_cache_reap_now(skc); + + up_read(&spl_kmem_cache_sem); + + /* XXX: Under linux we should return the remaining number of + * entries in the cache. We should do this as well. 
+ */ + return 1; } -EXPORT_SYMBOL(__kmem_cache_alloc); void -__kmem_cache_free(kmem_cache_t *cache, void *obj) +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { -#ifndef HAVE_KMEM_CACHE_CREATE_DTOR - kmem_cache_generic_destructor(obj, cache, 0); -#endif - kmem_cache_free(cache, obj); + ENTRY; + ASSERT(skc && skc->skc_magic == SKC_MAGIC); + + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + + slab_reclaim(skc); + EXIT; } -EXPORT_SYMBOL(__kmem_cache_free); +EXPORT_SYMBOL(spl_kmem_cache_reap_now); void -__kmem_reap(void) +spl_kmem_reap(void) { - ENTRY; - /* Since there's no easy hook in to linux to force all the registered - * shrinkers to run we just run the ones registered for this shim */ kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL); - EXIT; } -EXPORT_SYMBOL(__kmem_reap); +EXPORT_SYMBOL(spl_kmem_reap); int -kmem_init(void) +spl_kmem_init(void) { - int i; + int rc = 0; ENTRY; - init_rwsem(&kmem_cache_sem); - INIT_LIST_HEAD(&kmem_cache_list); + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + + spl_slab_cache = NULL; + spl_obj_cache = NULL; + + spl_slab_cache = kmem_cache_create("spl_slab_cache", + sizeof(spl_kmem_slab_t), + 0, 0, NULL); + if (spl_slab_cache == NULL) + GOTO(out_cache, rc = -ENOMEM); - for (i = 0; i < KMEM_CACHE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&kmem_cache_table[i]); + spl_obj_cache = kmem_cache_create("spl_obj_cache", + sizeof(spl_kmem_obj_t), + 0, 0, NULL); + if (spl_obj_cache == NULL) + GOTO(out_cache, rc = -ENOMEM); + +#ifdef HAVE_SET_SHRINKER + spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS, shrinker); + if (spl_kmem_cache_shrinker == NULL) + GOTO(out_cache, rc = -ENOMEM); +#else + register_shrinker(&spl_kmem_cache_shrinker); +#endif #ifdef DEBUG_KMEM + { int i; atomic64_set(&kmem_alloc_used, 0); atomic64_set(&vmem_alloc_used, 0); + atomic64_set(&kmem_cache_alloc_failed, 0); spin_lock_init(&kmem_lock); INIT_LIST_HEAD(&kmem_list); @@ -558,10 +677,18 @@ kmem_init(void) for (i = 0; i < VMEM_TABLE_SIZE; i++) INIT_HLIST_HEAD(&vmem_table[i]); - - atomic64_set(&kmem_cache_alloc_failed, 0); + } #endif - RETURN(0); + RETURN(rc); + +out_cache: + if (spl_obj_cache) + (void)kmem_cache_destroy(spl_obj_cache); + + if (spl_slab_cache) + (void)kmem_cache_destroy(spl_slab_cache); + + RETURN(rc); } #ifdef DEBUG_KMEM @@ -609,53 +736,61 @@ sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) #endif /* DEBUG_KMEM */ void -kmem_fini(void) +spl_kmem_fini(void) { - ENTRY; #ifdef DEBUG_KMEM - { - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - /* Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. 
*/ - if (atomic64_read(&kmem_alloc_used) != 0) - CWARN("kmem leaked %ld/%ld bytes\n", - atomic_read(&kmem_alloc_used), kmem_alloc_max); - - spin_lock_irqsave(&kmem_lock, flags); - if (!list_empty(&kmem_list)) - CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", - "address", "size", "data", "func", "line"); - - list_for_each_entry(kd, &kmem_list, kd_list) - CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", - kd->kd_addr, kd->kd_size, - sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(&kmem_lock, flags); - - if (atomic64_read(&vmem_alloc_used) != 0) - CWARN("vmem leaked %ld/%ld bytes\n", - atomic_read(&vmem_alloc_used), vmem_alloc_max); - - spin_lock_irqsave(&vmem_lock, flags); - if (!list_empty(&vmem_list)) - CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", - "address", "size", "data", "func", "line"); - - list_for_each_entry(kd, &vmem_list, kd_list) - CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", - kd->kd_addr, kd->kd_size, - sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(&vmem_lock, flags); - } + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + /* Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. Performance is not + * a serious concern here since it is module unload time. */ + if (atomic64_read(&kmem_alloc_used) != 0) + CWARN("kmem leaked %ld/%ld bytes\n", + atomic_read(&kmem_alloc_used), kmem_alloc_max); + + spin_lock_irqsave(&kmem_lock, flags); + if (!list_empty(&kmem_list)) + CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", + "address", "size", "data", "func", "line"); + + list_for_each_entry(kd, &kmem_list, kd_list) + CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", + kd->kd_addr, kd->kd_size, + sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + + spin_unlock_irqrestore(&kmem_lock, flags); + + if (atomic64_read(&vmem_alloc_used) != 0) + CWARN("vmem leaked %ld/%ld bytes\n", + atomic_read(&vmem_alloc_used), vmem_alloc_max); + + spin_lock_irqsave(&vmem_lock, flags); + if (!list_empty(&vmem_list)) + CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n", + "address", "size", "data", "func", "line"); + + list_for_each_entry(kd, &vmem_list, kd_list) + CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n", + kd->kd_addr, kd->kd_size, + sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + + spin_unlock_irqrestore(&vmem_lock, flags); +#endif + ENTRY; + +#ifdef HAVE_SET_SHRINKER + remove_shrinker(spl_kmem_cache_shrinker); +#else + unregister_shrinker(&spl_kmem_cache_shrinker); #endif + + (void)kmem_cache_destroy(spl_obj_cache); + (void)kmem_cache_destroy(spl_slab_cache); + EXIT; } diff --git a/modules/spl/spl-vnode.c b/modules/spl/spl-vnode.c index 7ff35a8f6..f6dbc00c3 100644 --- a/modules/spl/spl-vnode.c +++ b/modules/spl/spl-vnode.c @@ -633,7 +633,7 @@ void vn_fini(void) { file_t *fp, *next_fp; - int rc, leaked = 0; + int leaked = 0; ENTRY; spin_lock(&vn_file_lock); @@ -644,19 +644,14 @@ vn_fini(void) leaked++; } - rc = kmem_cache_destroy(vn_file_cache); - if (rc) - CWARN("Warning leaked vn_file_cache objects, %d\n", rc); - + kmem_cache_destroy(vn_file_cache); vn_file_cache = NULL; spin_unlock(&vn_file_lock); if (leaked > 0) CWARN("Warning %d files leaked\n", leaked); - rc = kmem_cache_destroy(vn_cache); - if (rc) - CWARN("Warning leaked vn_cache objects, %d\n", rc); + kmem_cache_destroy(vn_cache); EXIT; return; diff --git a/modules/splat/splat-kmem.c b/modules/splat/splat-kmem.c index 277a9afe0..7342052c1 100644 --- 
a/modules/splat/splat-kmem.c +++ b/modules/splat/splat-kmem.c @@ -39,16 +39,24 @@ #define SPLAT_KMEM_TEST2_DESC "Memory allocation test (kmem_zalloc)" #define SPLAT_KMEM_TEST3_ID 0x0103 -#define SPLAT_KMEM_TEST3_NAME "slab_alloc" -#define SPLAT_KMEM_TEST3_DESC "Slab constructor/destructor test" +#define SPLAT_KMEM_TEST3_NAME "vmem_alloc" +#define SPLAT_KMEM_TEST3_DESC "Memory allocation test (vmem_alloc)" #define SPLAT_KMEM_TEST4_ID 0x0104 -#define SPLAT_KMEM_TEST4_NAME "slab_reap" -#define SPLAT_KMEM_TEST4_DESC "Slab reaping test" +#define SPLAT_KMEM_TEST4_NAME "vmem_zalloc" +#define SPLAT_KMEM_TEST4_DESC "Memory allocation test (vmem_zalloc)" #define SPLAT_KMEM_TEST5_ID 0x0105 -#define SPLAT_KMEM_TEST5_NAME "vmem_alloc" -#define SPLAT_KMEM_TEST5_DESC "Memory allocation test (vmem_alloc)" +#define SPLAT_KMEM_TEST5_NAME "kmem_cache1" +#define SPLAT_KMEM_TEST5_DESC "Slab ctor/dtor test (small)" + +#define SPLAT_KMEM_TEST6_ID 0x0106 +#define SPLAT_KMEM_TEST6_NAME "kmem_cache2" +#define SPLAT_KMEM_TEST6_DESC "Slab ctor/dtor test (large)" + +#define SPLAT_KMEM_TEST7_ID 0x0107 +#define SPLAT_KMEM_TEST7_NAME "kmem_reap" +#define SPLAT_KMEM_TEST7_DESC "Slab reaping test" #define SPLAT_KMEM_ALLOC_COUNT 10 #define SPLAT_VMEM_ALLOC_COUNT 10 @@ -142,16 +150,91 @@ splat_kmem_test2(struct file *file, void *arg) return rc; } +static int +splat_kmem_test3(struct file *file, void *arg) +{ + void *ptr[SPLAT_VMEM_ALLOC_COUNT]; + int size = PAGE_SIZE; + int i, count, rc = 0; + + while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + count = 0; + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + ptr[i] = vmem_alloc(size, KM_SLEEP); + if (ptr[i]) + count++; + } + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) + if (ptr[i]) + vmem_free(ptr[i], size); + + splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + "%d byte allocations, %d/%d successful\n", + size, count, SPLAT_VMEM_ALLOC_COUNT); + if (count != SPLAT_VMEM_ALLOC_COUNT) + rc = -ENOMEM; + + size *= 2; + } + + return rc; +} + +static int +splat_kmem_test4(struct file *file, void *arg) +{ + void *ptr[SPLAT_VMEM_ALLOC_COUNT]; + int size = PAGE_SIZE; + int i, j, count, rc = 0; + + while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + count = 0; + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + ptr[i] = vmem_zalloc(size, KM_SLEEP); + if (ptr[i]) + count++; + } + + /* Ensure buffer has been zero filled */ + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { + for (j = 0; j < size; j++) { + if (((char *)ptr[i])[j] != '\0') { + splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + "%d-byte allocation was " + "not zeroed\n", size); + rc = -EFAULT; + } + } + } + + for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) + if (ptr[i]) + vmem_free(ptr[i], size); + + splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + "%d byte allocations, %d/%d successful\n", + size, count, SPLAT_VMEM_ALLOC_COUNT); + if (count != SPLAT_VMEM_ALLOC_COUNT) + rc = -ENOMEM; + + size *= 2; + } + + return rc; +} + #define SPLAT_KMEM_TEST_MAGIC 0x004488CCUL #define SPLAT_KMEM_CACHE_NAME "kmem_test" -#define SPLAT_KMEM_CACHE_SIZE 256 #define SPLAT_KMEM_OBJ_COUNT 128 -#define SPLAT_KMEM_OBJ_RECLAIM 64 +#define SPLAT_KMEM_OBJ_RECLAIM 16 typedef struct kmem_cache_data { - char kcd_buf[SPLAT_KMEM_CACHE_SIZE]; unsigned long kcd_magic; int kcd_flag; + char kcd_buf[0]; } kmem_cache_data_t; typedef struct kmem_cache_priv { @@ -159,48 +242,52 @@ typedef struct kmem_cache_priv { struct file *kcp_file; kmem_cache_t *kcp_cache; kmem_cache_data_t *kcp_kcd[SPLAT_KMEM_OBJ_COUNT]; + int kcp_size; int kcp_count; int kcp_rc; } kmem_cache_priv_t; static int 
-splat_kmem_test34_constructor(void *ptr, void *priv, int flags) +splat_kmem_cache_test_constructor(void *ptr, void *priv, int flags) { - kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; + kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; if (kcd) { - memset(kcd->kcd_buf, 0xaa, SPLAT_KMEM_CACHE_SIZE); - kcd->kcd_flag = 1; - if (kcp) { kcd->kcd_magic = kcp->kcp_magic; kcp->kcp_count++; } + + memset(kcd->kcd_buf, 0xaa, kcp->kcp_size - (sizeof *kcd)); + kcd->kcd_flag = 1; } return 0; } static void -splat_kmem_test34_destructor(void *ptr, void *priv) +splat_kmem_cache_test_destructor(void *ptr, void *priv) { - kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; + kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; if (kcd) { - memset(kcd->kcd_buf, 0xbb, SPLAT_KMEM_CACHE_SIZE); - kcd->kcd_flag = 0; - - if (kcp) + if (kcp) { + kcd->kcd_magic = 0; kcp->kcp_count--; + } + + memset(kcd->kcd_buf, 0xbb, kcp->kcp_size - (sizeof *kcd)); + kcd->kcd_flag = 0; } return; } static int -splat_kmem_test3(struct file *file, void *arg) +splat_kmem_cache_size_test(struct file *file, void *arg, + char *name, int size, int flags) { kmem_cache_t *cache = NULL; kmem_cache_data_t *kcd = NULL; @@ -209,22 +296,23 @@ splat_kmem_test3(struct file *file, void *arg) kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC; kcp.kcp_file = file; + kcp.kcp_size = size; kcp.kcp_count = 0; kcp.kcp_rc = 0; - cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, sizeof(*kcd), 0, - splat_kmem_test34_constructor, - splat_kmem_test34_destructor, - NULL, &kcp, NULL, 0); + cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + NULL, &kcp, NULL, flags); if (!cache) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); return -ENOMEM; } - kcd = kmem_cache_alloc(cache, 0); + kcd = kmem_cache_alloc(cache, KM_SLEEP); if (!kcd) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Unable to allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -232,7 +320,7 @@ splat_kmem_test3(struct file *file, void *arg) } if (!kcd->kcd_flag) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to run contructor for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -240,7 +328,7 @@ splat_kmem_test3(struct file *file, void *arg) } if (kcd->kcd_magic != kcp.kcp_magic) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to pass private data to constructor " "for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; @@ -248,23 +336,20 @@ splat_kmem_test3(struct file *file, void *arg) } max = kcp.kcp_count; - - /* Destructor's run lazily so it hard to check correctness here. 
- * We assume if it doesn't crash the free worked properly */ kmem_cache_free(cache, kcd); /* Destroy the entire cache which will force destructors to * run and we can verify one was called for every object */ kmem_cache_destroy(cache); if (kcp.kcp_count) { - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, + splat_vprint(file, name, "Failed to run destructor on all slab objects " "for '%s'\n", SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; } - splat_vprint(file, SPLAT_KMEM_TEST3_NAME, - "%d allocated/destroyed objects for '%s'\n", + splat_vprint(file, name, + "Successfully ran ctors/dtors for %d elements in '%s'\n", max, SPLAT_KMEM_CACHE_NAME); return rc; @@ -277,19 +362,38 @@ out_free: return rc; } +static int +splat_kmem_test5(struct file *file, void *arg) +{ + return splat_kmem_cache_size_test(file, arg, SPLAT_KMEM_TEST5_NAME, + sizeof(kmem_cache_data_t) * 1, 0); +} + +static int +splat_kmem_test6(struct file *file, void *arg) +{ + return splat_kmem_cache_size_test(file, arg, SPLAT_KMEM_TEST6_NAME, + sizeof(kmem_cache_data_t) * 1024, 0); +} + static void -splat_kmem_test4_reclaim(void *priv) +splat_kmem_cache_test_reclaim(void *priv) { kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; - int i; + int i, count; + + count = min(SPLAT_KMEM_OBJ_RECLAIM, kcp->kcp_count); + splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST7_NAME, + "Reaping %d objects from '%s'\n", count, + SPLAT_KMEM_CACHE_NAME); - splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST4_NAME, - "Reaping %d objects from '%s'\n", - SPLAT_KMEM_OBJ_RECLAIM, SPLAT_KMEM_CACHE_NAME); - for (i = 0; i < SPLAT_KMEM_OBJ_RECLAIM; i++) { + for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) { if (kcp->kcp_kcd[i]) { kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]); kcp->kcp_kcd[i] = NULL; + + if (--count == 0) + break; } } @@ -297,24 +401,25 @@ splat_kmem_test4_reclaim(void *priv) } static int -splat_kmem_test4(struct file *file, void *arg) +splat_kmem_test7(struct file *file, void *arg) { kmem_cache_t *cache; kmem_cache_priv_t kcp; - int i, rc = 0, max, reclaim_percent, target_percent; + int i, rc = 0; kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC; kcp.kcp_file = file; + kcp.kcp_size = 256; kcp.kcp_count = 0; kcp.kcp_rc = 0; - cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, - sizeof(kmem_cache_data_t), 0, - splat_kmem_test34_constructor, - splat_kmem_test34_destructor, - splat_kmem_test4_reclaim, &kcp, NULL, 0); + cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + splat_kmem_cache_test_reclaim, + &kcp, NULL, 0); if (!cache) { - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); return -ENOMEM; } @@ -323,36 +428,57 @@ splat_kmem_test4(struct file *file, void *arg) for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) { /* All allocations need not succeed */ - kcp.kcp_kcd[i] = kmem_cache_alloc(cache, 0); + kcp.kcp_kcd[i] = kmem_cache_alloc(cache, KM_SLEEP); if (!kcp.kcp_kcd[i]) { - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, "Unable to allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); } } - max = kcp.kcp_count; - ASSERT(max > 0); - - /* Force shrinker to run */ - kmem_reap(); - - /* Reclaim reclaimed objects, this ensure the destructors are run */ - kmem_cache_reap_now(cache); - - reclaim_percent = ((kcp.kcp_count * 100) / max); - target_percent = (((SPLAT_KMEM_OBJ_COUNT - SPLAT_KMEM_OBJ_RECLAIM) * 100) / - SPLAT_KMEM_OBJ_COUNT); - splat_vprint(file, SPLAT_KMEM_TEST4_NAME, - "%d%% 
(%d/%d) of previous size, target of " - "%d%%-%d%% for '%s'\n", reclaim_percent, kcp.kcp_count, - max, target_percent - 10, target_percent + 10, - SPLAT_KMEM_CACHE_NAME); - if ((reclaim_percent < target_percent - 10) || - (reclaim_percent > target_percent + 10)) - rc = -EINVAL; + ASSERT(kcp.kcp_count > 0); + + /* Request the slab cache free any objects it can. For a few reasons + * this may not immediately result in more free memory even if objects + * are freed. First off, due to fragmentation we may not be able to + * reclaim any slabs. Secondly, even if we do we fully clear some + * slabs we will not want to immedately reclaim all of them because + * we may contend with cache allocs and thrash. What we want to see + * is slab size decrease more gradually as it becomes clear they + * will not be needed. This should be acheivable in less than minute + * if it takes longer than this something has gone wrong. + */ + for (i = 0; i < 60; i++) { + kmem_cache_reap_now(cache); + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "%s cache objects %d, slabs %u/%u objs %u/%u\n", + SPLAT_KMEM_CACHE_NAME, kcp.kcp_count, + (unsigned)cache->skc_slab_alloc, + (unsigned)cache->skc_slab_total, + (unsigned)cache->skc_obj_alloc, + (unsigned)cache->skc_obj_total); + + if (cache->skc_obj_total == 0) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + + if (cache->skc_obj_total == 0) { + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "Successfully created %d objects " + "in cache %s and reclaimed them\n", + SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME); + } else { + splat_vprint(file, SPLAT_KMEM_TEST7_NAME, + "Failed to reclaim %u/%d objects from cache %s\n", + (unsigned)cache->skc_obj_total, SPLAT_KMEM_OBJ_COUNT, + SPLAT_KMEM_CACHE_NAME); + rc = -ENOMEM; + } - /* Cleanup our mess */ + /* Cleanup our mess (for failure case of time expiring) */ for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) if (kcp.kcp_kcd[i]) kmem_cache_free(cache, kcp.kcp_kcd[i]); @@ -362,38 +488,6 @@ splat_kmem_test4(struct file *file, void *arg) return rc; } -static int -splat_kmem_test5(struct file *file, void *arg) -{ - void *ptr[SPLAT_VMEM_ALLOC_COUNT]; - int size = PAGE_SIZE; - int i, count, rc = 0; - - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { - count = 0; - - for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { - ptr[i] = vmem_alloc(size, KM_SLEEP); - if (ptr[i]) - count++; - } - - for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) - if (ptr[i]) - vmem_free(ptr[i], size); - - splat_vprint(file, SPLAT_KMEM_TEST5_NAME, - "%d byte allocations, %d/%d successful\n", - size, count, SPLAT_VMEM_ALLOC_COUNT); - if (count != SPLAT_VMEM_ALLOC_COUNT) - rc = -ENOMEM; - - size *= 2; - } - - return rc; -} - splat_subsystem_t * splat_kmem_init(void) { @@ -421,6 +515,10 @@ splat_kmem_init(void) SPLAT_KMEM_TEST4_ID, splat_kmem_test4); SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC, SPLAT_KMEM_TEST5_ID, splat_kmem_test5); + SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC, + SPLAT_KMEM_TEST6_ID, splat_kmem_test6); + SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC, + SPLAT_KMEM_TEST7_ID, splat_kmem_test7); return sub; } @@ -429,6 +527,8 @@ void splat_kmem_fini(splat_subsystem_t *sub) { ASSERT(sub); + SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID); + SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID); SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID); |
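
As an aside on the spl_hash_ptr() helper added above: the patch notes that hashing raw, mostly page-aligned buffer addresses distributes poorly, so it shifts off the always-zero low-order bits before hashing. The standalone user-space sketch below (all names invented, 4 KiB pages and a 12-bit table assumed, mirroring SPL_KMEM_CACHE_HASH_BITS) shows the same idea with a generic multiplicative hash in place of the kernel's hash_long().

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed 4 KiB pages */
#define HASH_BITS	12		/* assumed table size, as in the patch */

/*
 * Multiplicative hash of the same general form as the kernel's hash_long():
 * drop the always-zero low bits, multiply by a large odd constant, and keep
 * the top HASH_BITS bits as the bucket index.
 */
static unsigned long
bucket_of(const void *ptr)
{
	uint64_t key = (uint64_t)(uintptr_t)ptr >> PAGE_SHIFT;

	return (unsigned long)((key * 0x9e3779b97f4a7c15ULL) >>
	    (64 - HASH_BITS));
}

int
main(void)
{
	uintptr_t base = 0x7f0000000000UL;	/* arbitrary page-aligned base */
	int i;

	/* Consecutive page-aligned addresses land in well-spread buckets */
	for (i = 0; i < 4; i++)
		printf("obj %d -> bucket %lu\n", i,
		    bucket_of((void *)(base + (uintptr_t)i * 4096)));

	return 0;
}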