-rw-r--r--  include/sys/kmem.h      234
-rw-r--r--  module/spl/spl-kmem.c   248
2 files changed, 289 insertions, 193 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 17b3a2276..e90c6b8ce 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -87,10 +87,10 @@ kzalloc_nofail(size_t size, gfp_t flags)
return ptr;
}
-#ifdef HAVE_KMALLOC_NODE
static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node)
{
+#ifdef HAVE_KMALLOC_NODE
void *ptr;
do {
@@ -98,16 +98,63 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node)
} while (ptr == NULL && (flags & __GFP_WAIT));
return ptr;
-}
+#else
+ return kmalloc_nofail(size, flags);
#endif /* HAVE_KMALLOC_NODE */
+}
+
+static inline void *
+vmalloc_nofail(size_t size, gfp_t flags)
+{
+ void *ptr;
+
+ /*
+ * Retry failed __vmalloc() allocations once every second. The
+ * rationale for the delay is that the likely failure modes are:
+ *
+ * 1) The system has completely exhausted memory, in which case
+ * delaying 1 second for the memory reclaim to run is reasonable
+ * to avoid thrashing the system.
+ * 2) The system has memory but has exhausted the small virtual
+ * address space available on 32-bit systems. Retrying the
+ * allocation immediately will only result in spinning on the
+ * virtual address space lock. It is better to delay a second and
+ * hope that another process will free some of the address space.
+ * But the bottom line is there is not much we can actually do
+ * since we can never safely return a failure and honor the
+ * Solaris semantics.
+ */
+ while (1) {
+ ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+ if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ } else {
+ break;
+ }
+ }
+
+ return ptr;
+}
+
+static inline void *
+vzalloc_nofail(size_t size, gfp_t flags)
+{
+ void *ptr;
+
+ ptr = vmalloc_nofail(size, flags);
+ if (ptr)
+ memset(ptr, 0, (size));
+
+ return ptr;
+}
#ifdef DEBUG_KMEM
-# ifdef HAVE_ATOMIC64_T
-extern atomic64_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
+/*
+ * Memory accounting functions to be used only when DEBUG_KMEM is set.
+ */
+# ifdef HAVE_ATOMIC64_T
# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
@@ -118,13 +165,13 @@ extern unsigned long long vmem_alloc_max;
# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
-# else
-
-extern atomic_t kmem_alloc_used;
+extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
-extern atomic_t vmem_alloc_used;
+extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;
+# else /* HAVE_ATOMIC64_T */
+
# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
@@ -134,90 +181,107 @@ extern unsigned long long vmem_alloc_max;
# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
-# endif /* _LP64 */
-
-# define kmem_alloc(size, flags) __kmem_alloc((size), (flags), 0, 0)
-# define kmem_zalloc(size, flags) __kmem_alloc((size), ((flags) | \
- __GFP_ZERO), 0, 0)
-
-/* The node alloc functions are only used by the SPL code itself */
-# ifdef HAVE_KMALLOC_NODE
-# define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 1, \
- node)
-# else
-# define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 0, 0)
-# endif
+extern atomic_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+extern atomic_t vmem_alloc_used;
+extern unsigned long long vmem_alloc_max;
-# define vmem_zalloc(size, flags) vmem_alloc((size), ((flags) | \
- __GFP_ZERO))
+# endif /* HAVE_ATOMIC64_T */
# ifdef DEBUG_KMEM_TRACKING
-
-extern void *kmem_alloc_track(size_t size, int flags, const char *func,
- int line, int node_alloc, int node);
-extern void kmem_free_track(void *ptr, size_t size);
-extern void *vmem_alloc_track(size_t size, int flags, const char *func,
- int line);
-extern void vmem_free_track(void *ptr, size_t size);
-
-# define __kmem_alloc(size, flags, na, node) kmem_alloc_track((size), \
- (flags), __FUNCTION__, \
- __LINE__, (na), (node))
-# define kmem_free(ptr, size) kmem_free_track((ptr), (size))
-# define vmem_alloc(size, flags) vmem_alloc_track((size), \
- (flags),__FUNCTION__, \
- __LINE__)
-# define vmem_free(ptr, size) vmem_free_track((ptr), (size))
+/*
+ * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ *
+ * The maximum level of memory debugging. All memory will be accounted
+ * for and each allocation will be explicitly tracked. Any allocation
+ * which is leaked will be reported on module unload and the exact location
+ * where that memory was allocated will be reported. This level of memory
+ * tracking will have a significant impact on performance and should only
+ * be enabled for debugging. This feature may be enabled by passing
+ * --enable-debug-kmem-tracking to configure.
+ */
+# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
+ __FUNCTION__, __LINE__, 0, 0)
+# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
+ __FUNCTION__, __LINE__, 0, 0)
+# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
+ __FUNCTION__, __LINE__, 1, nd)
+# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
+
+# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
+ __FUNCTION__, __LINE__)
+# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
+ __FUNCTION__, __LINE__)
+# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
+
+extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
+extern void kmem_free_track(void *, size_t);
+extern void *vmem_alloc_track(size_t, int, const char *, int);
+extern void vmem_free_track(void *, size_t);
# else /* DEBUG_KMEM_TRACKING */
-
-extern void *kmem_alloc_debug(size_t size, int flags, const char *func,
- int line, int node_alloc, int node);
-extern void kmem_free_debug(void *ptr, size_t size);
-extern void *vmem_alloc_debug(size_t size, int flags, const char *func,
- int line);
-extern void vmem_free_debug(void *ptr, size_t size);
-
-# define __kmem_alloc(size, flags, na, node) kmem_alloc_debug((size), \
- (flags), __FUNCTION__, \
- __LINE__, (na), (node))
-# define kmem_free(ptr, size) kmem_free_debug((ptr), (size))
-# define vmem_alloc(size, flags) vmem_alloc_debug((size), \
- (flags), __FUNCTION__, \
- __LINE__)
-# define vmem_free(ptr, size) vmem_free_debug((ptr), (size))
+/*
+ * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ *
+ * The default build will set DEBUG_KMEM. This provides basic memory
+ * accounting with little to no impact on performance. When the module
+ * is unloaded, if any memory was leaked, the total number of leaked bytes
+ * will be reported on the console. To disable this basic accounting
+ * pass the --disable-debug-kmem option to configure.
+ */
+# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
+ __FUNCTION__, __LINE__, 0, 0)
+# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
+ __FUNCTION__, __LINE__, 0, 0)
+# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
+ __FUNCTION__, __LINE__, 1, nd)
+# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
+
+# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
+ __FUNCTION__, __LINE__)
+# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
+ __FUNCTION__, __LINE__)
+# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
+
+extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
+extern void kmem_free_debug(void *, size_t);
+extern void *vmem_alloc_debug(size_t, int, const char *, int);
+extern void vmem_free_debug(void *, size_t);
# endif /* DEBUG_KMEM_TRACKING */
-
#else /* DEBUG_KMEM */
+/*
+ * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ *
+ * All debugging is disabled. There will be no overhead, not even
+ * for minimal memory accounting. To enable basic accounting pass the
+ * --enable-debug-kmem option to configure.
+ */
+# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
+# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
+# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
+# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
-# define kmem_alloc(size, flags) kmalloc_nofail((size), (flags))
-# define kmem_zalloc(size, flags) kzalloc_nofail((size), (flags))
-# define kmem_free(ptr, size) ((void)(size), kfree(ptr))
-
-# ifdef HAVE_KMALLOC_NODE
-# define kmem_alloc_node(size, flags, node) \
- kmalloc_node_nofail((size), (flags), (node))
-# else
-# define kmem_alloc_node(size, flags, node) \
- kmalloc_nofail((size), (flags))
-# endif
-
-# define vmem_alloc(size, flags) __vmalloc((size), ((flags) | \
- __GFP_HIGHMEM), PAGE_KERNEL)
-# define vmem_zalloc(size, flags) \
-({ \
- void *_ptr_ = __vmalloc((size),((flags)|__GFP_HIGHMEM),PAGE_KERNEL); \
- if (_ptr_) \
- memset(_ptr_, 0, (size)); \
- _ptr_; \
-})
-# define vmem_free(ptr, size) ((void)(size), vfree(ptr))
+# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
+# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
+# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
#endif /* DEBUG_KMEM */
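
A usage sketch (not part of this patch): regardless of which DEBUG_KMEM configuration is built, callers see the same Solaris-style interface and a KM_SLEEP allocation retries rather than returning NULL. The example_alloc_pattern() function and its sizes below are hypothetical.

#include <sys/kmem.h>

static int
example_alloc_pattern(size_t small_sz, size_t large_sz)
{
	void *buf;
	void *big;

	/* Small, physically backed allocation; may block until it succeeds. */
	buf = kmem_zalloc(small_sz, KM_SLEEP);

	/* Large allocation backed by virtual memory to avoid fragmentation. */
	big = vmem_alloc(large_sz, KM_SLEEP);

	/* ... use the buffers ... */

	/* Solaris semantics: the caller passes the size back when freeing. */
	vmem_free(big, large_sz);
	kmem_free(buf, small_sz);

	return (0);
}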
+extern int kmem_debugging(void);
+extern char *kmem_vasprintf(const char *fmt, va_list ap);
+extern char *kmem_asprintf(const char *fmt, ...);
+extern char *strdup(const char *str);
+extern void strfree(char *str);
+
+
/*
- * Slab allocation interfaces
+ * Slab allocation interfaces. The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space. The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size. This slab implementation also supports both constructors and
+ * destructors which the Linux slab does not.
*/
enum {
KMC_BIT_NOTOUCH = 0, /* Don't update ages */
@@ -246,12 +310,6 @@ enum {
#define KMC_REAP_CHUNK INT_MAX
#define KMC_DEFAULT_SEEKS 1
-extern int kmem_debugging(void);
-extern char *kmem_vasprintf(const char *fmt, va_list ap);
-extern char *kmem_asprintf(const char *fmt, ...);
-#define strfree(str) kfree(str)
-#define strdup(str) kstrdup(str, GFP_KERNEL)
-
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;
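
The slab comment above mentions constructors, destructors, and virtually backed slabs; below is a hedged sketch (not from this patch) of how such a cache might be created and used, assuming the Solaris-compatible kmem_cache_create() argument order and the KMC_VMEM backing flag provided by the SPL. The my_obj_* names are hypothetical.

#include <sys/kmem.h>
#include <sys/mutex.h>

typedef struct my_obj {
	kmutex_t	mo_lock;
	int		mo_state;
} my_obj_t;

/* Constructor: expensive initialization amortized across object reuse. */
static int
my_obj_ctor(void *buf, void *priv, int kmflags)
{
	my_obj_t *obj = buf;

	mutex_init(&obj->mo_lock, NULL, MUTEX_DEFAULT, NULL);
	obj->mo_state = 0;

	return (0);
}

/* Destructor: cleanup the native Linux slab can no longer perform. */
static void
my_obj_dtor(void *buf, void *priv)
{
	my_obj_t *obj = buf;

	mutex_destroy(&obj->mo_lock);
}

static kmem_cache_t *my_obj_cache;

static void
my_obj_example(void)
{
	my_obj_t *obj;

	/* KMC_VMEM: back the slabs with virtual memory. */
	my_obj_cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t),
	    0, my_obj_ctor, my_obj_dtor, NULL, NULL, NULL, KMC_VMEM);

	obj = kmem_cache_alloc(my_obj_cache, KM_SLEEP);
	/* ... use obj ... */
	kmem_cache_free(my_obj_cache, obj);

	kmem_cache_destroy(my_obj_cache);
}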
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index e575b1ee9..ec1ccb4ce 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -271,6 +271,34 @@ kmem_asprintf(const char *fmt, ...)
}
EXPORT_SYMBOL(kmem_asprintf);
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc_nofail(n + 1, flags);
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return ptr;
+}
+
+char *
+strdup(const char *str)
+{
+ return __strdup(str, KM_SLEEP);
+}
+EXPORT_SYMBOL(strdup);
+
+void
+strfree(char *str)
+{
+ kmem_free(str, strlen(str) + 1);
+}
+EXPORT_SYMBOL(strfree);
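
A brief usage sketch (hypothetical, not from this patch): strings obtained from strdup() and kmem_asprintf() are released with strfree(), which recovers the allocation size from strlen() so callers need not track it.

static void
example_string_helpers(const char *name)
{
	char *copy, *msg;

	copy = strdup(name);				/* KM_SLEEP copy */
	msg = kmem_asprintf("device '%s' ready", name);

	/* ... */

	/* strfree() frees strlen(str) + 1 bytes, matching the allocations. */
	strfree(msg);
	strfree(copy);
}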
+
/*
* Memory allocation interfaces and debugging for basic kmem_*
* and vmem_* style memory allocation. When DEBUG_KMEM is enabled
@@ -285,12 +313,12 @@ atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long vmem_alloc_max = 0;
-# else
+# else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
atomic_t vmem_alloc_used = ATOMIC_INIT(0);
unsigned long long vmem_alloc_max = 0;
-# endif /* _LP64 */
+# endif /* HAVE_ATOMIC64_T */
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
@@ -340,77 +368,9 @@ EXPORT_SYMBOL(kmem_list);
EXPORT_SYMBOL(vmem_lock);
EXPORT_SYMBOL(vmem_table);
EXPORT_SYMBOL(vmem_list);
-# endif
-#endif
-
-/*
- * Slab allocation interfaces
- *
- * While the Linux slab implementation was inspired by the Solaris
- * implemenation I cannot use it to emulate the Solaris APIs. I
- * require two features which are not provided by the Linux slab.
- *
- * 1) Constructors AND destructors. Recent versions of the Linux
- * kernel have removed support for destructors. This is a deal
- * breaker for the SPL which contains particularly expensive
- * initializers for mutex's, condition variables, etc. We also
- * require a minimal level of cleanup for these data types unlike
- * many Linux data type which do need to be explicitly destroyed.
- *
- * 2) Virtual address space backed slab. Callers of the Solaris slab
- * expect it to work well for both small are very large allocations.
- * Because of memory fragmentation the Linux slab which is backed
- * by kmalloc'ed memory performs very badly when confronted with
- * large numbers of large allocations. Basing the slab on the
- * virtual address space removes the need for contigeous pages
- * and greatly improve performance for large allocations.
- *
- * For these reasons, the SPL has its own slab implementation with
- * the needed features. It is not as highly optimized as either the
- * Solaris or Linux slabs, but it should get me most of what is
- * needed until it can be optimized or obsoleted by another approach.
- *
- * One serious concern I do have about this method is the relatively
- * small virtual address space on 32bit arches. This will seriously
- * constrain the size of the slab caches and their performance.
- *
- * XXX: Improve the partial slab list by carefully maintaining a
- * strict ordering of fullest to emptiest slabs based on
- * the slab reference count. This gaurentees the when freeing
- * slabs back to the system we need only linearly traverse the
- * last N slabs in the list to discover all the freeable slabs.
- *
- * XXX: NUMA awareness for optionally allocating memory close to a
- * particular core. This can be adventageous if you know the slab
- * object will be short lived and primarily accessed from one core.
- *
- * XXX: Slab coloring may also yield performance improvements and would
- * be desirable to implement.
- */
-
-struct list_head spl_kmem_cache_list; /* List of caches */
-struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
-
-static int spl_cache_flush(spl_kmem_cache_t *skc,
- spl_kmem_magazine_t *skm, int flush);
-
-#ifdef HAVE_SET_SHRINKER
-static struct shrinker *spl_kmem_cache_shrinker;
-#else
-static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
- unsigned int gfp_mask);
-static struct shrinker spl_kmem_cache_shrinker = {
- .shrink = spl_kmem_cache_generic_shrinker,
- .seeks = KMC_DEFAULT_SEEKS,
-};
-#endif
-
-#ifdef DEBUG_KMEM
-# ifdef DEBUG_KMEM_TRACKING
static kmem_debug_t *
-kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
- void *addr)
+kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -444,17 +404,20 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
unsigned long irq_flags;
SENTRY;
+ /* Function may be called with KM_NOSLEEP so failure is possible */
dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
flags & ~__GFP_ZERO);
- if (dptr == NULL) {
+ if (unlikely(dptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
"kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
sizeof(kmem_debug_t), flags, func, line,
kmem_alloc_used_read(), kmem_alloc_max);
} else {
- /* Marked unlikely because we should never be doing this,
- * we tolerate to up 2 pages but a single page is best. */
+ /*
+ * Marked unlikely because we should never be doing this,
+ * we tolerate up to 2 pages but a single page is best.
+ */
if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
"kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
@@ -463,14 +426,17 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
spl_debug_dumpstack(NULL);
}
- /* We use kstrdup() below because the string pointed to by
+ /*
+ * We use __strdup() below because the string pointed to by
* __FUNCTION__ might not be available by the time we want
- * to print it since the module might have been unloaded. */
- dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
+ * to print it since the module might have been unloaded.
+ * This can only fail in the KM_NOSLEEP case.
+ */
+ dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
if (unlikely(dptr->kd_func == NULL)) {
kfree(dptr);
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
- "debug kstrdup() at %s:%d failed (%lld/%llu)\n",
+ "debug __strdup() at %s:%d failed (%lld/%llu)\n",
func, line, kmem_alloc_used_read(), kmem_alloc_max);
goto out;
}
@@ -533,7 +499,8 @@ kmem_free_track(void *ptr, size_t size)
dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
- ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
+ /* Must exist in hash due to kmem_alloc() */
+ ASSERT(dptr);
/* Size must match */
ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@@ -567,28 +534,37 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
ASSERT(flags & KM_SLEEP);
+ /* Function may be called with KM_NOSLEEP so failure is possible */
dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
flags & ~__GFP_ZERO);
- if (dptr == NULL) {
+ if (unlikely(dptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
"vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
sizeof(kmem_debug_t), flags, func, line,
vmem_alloc_used_read(), vmem_alloc_max);
} else {
- /* We use kstrdup() below because the string pointed to by
+ /*
+ * We use __strdup() below because the string pointed to by
* __FUNCTION__ might not be available by the time we want
- * to print it, since the module might have been unloaded. */
- dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
+ * to print it, since the module might have been unloaded.
+ * This can never fail because we have already asserted
+ * that flags is KM_SLEEP.
+ */
+ dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
if (unlikely(dptr->kd_func == NULL)) {
kfree(dptr);
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
- "debug kstrdup() at %s:%d failed (%lld/%llu)\n",
+ "debug __strdup() at %s:%d failed (%lld/%llu)\n",
func, line, vmem_alloc_used_read(), vmem_alloc_max);
goto out;
}
- ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
- PAGE_KERNEL);
+ /* Use the correct allocator */
+ if (flags & __GFP_ZERO) {
+ ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
+ } else {
+ ptr = vmalloc_nofail(size, flags);
+ }
if (unlikely(ptr == NULL)) {
kfree(dptr->kd_func);
@@ -600,9 +576,6 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
goto out;
}
- if (flags & __GFP_ZERO)
- memset(ptr, 0, size);
-
vmem_alloc_used_add(size);
if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
vmem_alloc_max = vmem_alloc_used_read();
@@ -640,7 +613,9 @@ vmem_free_track(void *ptr, size_t size)
(unsigned long long) size);
dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
- ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
+
+ /* Must exist in hash due to vmem_alloc() */
+ ASSERT(dptr);
/* Size must match */
ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@@ -673,11 +648,13 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
void *ptr;
SENTRY;
- /* Marked unlikely because we should never be doing this,
- * we tolerate to up 2 pages but a single page is best. */
+ /*
+ * Marked unlikely because we should never be doing this,
+ * we tolerate up to 2 pages but a single page is best.
+ */
if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
SDEBUG(SD_CONSOLE | SD_WARNING,
- "Large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
+ "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
(unsigned long long) size, flags, func, line,
kmem_alloc_used_read(), kmem_alloc_max);
spl_debug_dumpstack(NULL);
@@ -693,7 +670,7 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
ptr = kmalloc_nofail(size, flags);
}
- if (ptr == NULL) {
+ if (unlikely(ptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
(unsigned long long) size, flags, func, line,
@@ -706,8 +683,9 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
SDEBUG_LIMIT(SD_INFO,
"kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
(unsigned long long) size, flags, func, line, ptr,
- kmem_alloc_used_read(), kmem_alloc_max);
+ kmem_alloc_used_read(), kmem_alloc_max);
}
+
SRETURN(ptr);
}
EXPORT_SYMBOL(kmem_alloc_debug);
@@ -724,8 +702,6 @@ kmem_free_debug(void *ptr, size_t size)
SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
(unsigned long long) size, kmem_alloc_used_read(),
kmem_alloc_max);
-
- memset(ptr, 0x5a, size);
kfree(ptr);
SEXIT;
@@ -740,17 +716,19 @@ vmem_alloc_debug(size_t size, int flags, const char *func, int line)
ASSERT(flags & KM_SLEEP);
- ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
- PAGE_KERNEL);
- if (ptr == NULL) {
+ /* Use the correct allocator */
+ if (flags & __GFP_ZERO) {
+ ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
+ } else {
+ ptr = vmalloc_nofail(size, flags);
+ }
+
+ if (unlikely(ptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
(unsigned long long) size, flags, func, line,
vmem_alloc_used_read(), vmem_alloc_max);
} else {
- if (flags & __GFP_ZERO)
- memset(ptr, 0, size);
-
vmem_alloc_used_add(size);
if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
vmem_alloc_max = vmem_alloc_used_read();
@@ -776,8 +754,6 @@ vmem_free_debug(void *ptr, size_t size)
SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
(unsigned long long) size, vmem_alloc_used_read(),
vmem_alloc_max);
-
- memset(ptr, 0x5a, size);
vfree(ptr);
SEXIT;
@@ -787,6 +763,68 @@ EXPORT_SYMBOL(vmem_free_debug);
# endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ * kernel have removed support for destructors. This is a deal
+ * breaker for the SPL which contains particularly expensive
+ * initializers for mutexes, condition variables, etc. We also
+ * require a minimal level of cleanup for these data types unlike
+ * many Linux data types which do need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ * expect it to work well for both small and very large allocations.
+ * Because of memory fragmentation the Linux slab which is backed
+ * by kmalloc'ed memory performs very badly when confronted with
+ * large numbers of large allocations. Basing the slab on the
+ * virtual address space removes the need for contiguous pages
+ * and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ *
+ * XXX: Improve the partial slab list by carefully maintaining a
+ * strict ordering of fullest to emptiest slabs based on
+ * the slab reference count. This guarantees that when freeing
+ * slabs back to the system we need only linearly traverse the
+ * last N slabs in the list to discover all the freeable slabs.
+ *
+ * XXX: NUMA awareness for optionally allocating memory close to a
+ * particular core. This can be advantageous if you know the slab
+ * object will be short lived and primarily accessed from one core.
+ *
+ * XXX: Slab coloring may also yield performance improvements and would
+ * be desirable to implement.
+ */
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+
+static int spl_cache_flush(spl_kmem_cache_t *skc,
+ spl_kmem_magazine_t *skm, int flush);
+
+#ifdef HAVE_SET_SHRINKER
+static struct shrinker *spl_kmem_cache_shrinker;
+#else
+static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
+ unsigned int gfp_mask);
+static struct shrinker spl_kmem_cache_shrinker = {
+ .shrink = spl_kmem_cache_generic_shrinker,
+ .seeks = KMC_DEFAULT_SEEKS,
+};
+#endif
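
For context, a hedged sketch (not part of this patch) of how such a compat shrinker is typically registered and torn down; set_shrinker()/remove_shrinker() are the legacy kernel calls selected by HAVE_SET_SHRINKER, register_shrinker()/unregister_shrinker() are the current ones, and the example_* wrappers are hypothetical.

static void
example_shrinker_setup(void)
{
#ifdef HAVE_SET_SHRINKER
	/* Legacy API: the kernel allocates and returns the shrinker handle. */
	spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
	    spl_kmem_cache_generic_shrinker);
#else
	/* Current API: the caller owns the statically defined struct. */
	register_shrinker(&spl_kmem_cache_shrinker);
#endif
}

static void
example_shrinker_teardown(void)
{
#ifdef HAVE_SET_SHRINKER
	remove_shrinker(spl_kmem_cache_shrinker);
#else
	unregister_shrinker(&spl_kmem_cache_shrinker);
#endif
}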
+
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{