summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/sys/kmem.h159
-rw-r--r--include/sys/kmem_cache.h24
-rw-r--r--include/sys/vmem.h171
3 files changed, 95 insertions, 259 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index a9d94c909..045d07c2c 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -26,6 +26,7 @@
#define _SPL_KMEM_H
#include <linux/slab.h>
+#include <linux/sched.h>
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap);
@@ -36,68 +37,41 @@ extern void strfree(char *str);
/*
* Memory allocation interfaces
*/
-#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */
-#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */
-#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
-#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */
-#define KM_FLAGS __GFP_BITS_MASK
-#define KM_VMFLAGS GFP_LEVEL_MASK
+#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
+#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
+#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
+#define KM_ZERO 0x1000 /* zero the allocation */
+#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-#define __GFP_ZERO 0x8000
-#endif
+#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
/*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32. To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
*/
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- do {
- ptr = kmalloc(size, flags);
- } while (ptr == NULL && (flags & __GFP_WAIT));
-
- return (ptr);
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
{
- void *ptr;
-
- do {
- ptr = kzalloc(size, flags);
- } while (ptr == NULL && (flags & __GFP_WAIT));
+ gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
- return (ptr);
-}
+ if (flags & KM_NOSLEEP) {
+ lflags |= GFP_ATOMIC | __GFP_NORETRY;
+ } else {
+ lflags |= GFP_KERNEL;
+ if ((current->flags & PF_FSTRANS))
+ lflags &= ~(__GFP_IO|__GFP_FS);
+ }
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
- void *ptr;
+ if (flags & KM_PUSHPAGE)
+ lflags |= __GFP_HIGH;
- do {
- ptr = kmalloc_node(size, flags, node);
- } while (ptr == NULL && (flags & __GFP_WAIT));
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
- return (ptr);
+ return (lflags);
}
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
#ifdef HAVE_ATOMIC64_T
#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
@@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
#endif /* HAVE_ATOMIC64_T */
-#ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging. All memory will be accounted
- * for and each allocation will be explicitly tracked. Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported. This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging. This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM. This provides basic memory
- * accounting with little to no impact on performance. When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console. To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled. There will be no overhead even for
- * minimal memory accounting. To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
-#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
-#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
-#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
-#endif /* DEBUG_KMEM */
+#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
-#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
- ((ptr) < (void *)VMALLOC_END))
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
#endif /* _SPL_KMEM_H */
diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h
index a5bc0322b..a9b5bdd2f 100644
--- a/include/sys/kmem_cache.h
+++ b/include/sys/kmem_cache.h
@@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
extern void spl_kmem_reap(void);
@@ -214,29 +215,6 @@ extern void spl_kmem_reap(void);
#define kmem_cache_reap_now(skc) \
spl_kmem_cache_reap_now(skc, skc->skc_reap)
#define kmem_reap() spl_kmem_reap()
-#define kmem_virt(ptr) \
- (((ptr) >= (void *)VMALLOC_START) && \
- ((ptr) < (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted. This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
-{
- if (skc->skc_linux_cache == NULL)
- return;
-
-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
- skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
- skc->skc_linux_cache->gfpflags |= flags;
-#endif
-}
/*
* The following functions are only available for internal use.
diff --git a/include/sys/vmem.h b/include/sys/vmem.h
index f59ac5e8b..6eb2c6769 100644
--- a/include/sys/vmem.h
+++ b/include/sys/vmem.h
@@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask);
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
#endif
-static inline void *
-vmalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- /*
- * Retry failed __vmalloc() allocations once every second. The
- * rational for the delay is that the likely failure modes are:
- *
- * 1) The system has completely exhausted memory, in which case
- * delaying 1 second for the memory reclaim to run is reasonable
- * to avoid thrashing the system.
- * 2) The system has memory but has exhausted the small virtual
- * address space available on 32-bit systems. Retrying the
- * allocation immediately will only result in spinning on the
- * virtual address space lock. It is better delay a second and
- * hope that another process will free some of the address space.
- * But the bottom line is there is not much we can actually do
- * since we can never safely return a failure and honor the
- * Solaris semantics.
- */
- while (1) {
- ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
- if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
- } else {
- break;
- }
- }
-
- return (ptr);
-}
-
-static inline void *
-vzalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- ptr = vmalloc_nofail(size, flags);
- if (ptr)
- memset(ptr, 0, (size));
-
- return (ptr);
-}
-
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
-#ifdef HAVE_ATOMIC64_T
-
-#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
-
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#else /* HAVE_ATOMIC64_T */
-
-#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
-
-extern atomic_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#endif /* HAVE_ATOMIC64_T */
-
-#ifdef DEBUG_KMEM_TRACKING
/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
*
- * The maximum level of memory debugging. All memory will be accounted
- * for and each allocation will be explicitly tracked. Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported. This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging. This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-extern void *vmem_alloc_track(size_t, int, const char *, int);
-extern void vmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules, it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
*
- * The default build will set DEBUG_KEM. This provides basic memory
- * accounting with little to no impact on performance. When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console. To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
-
-extern void *vmem_alloc_debug(size_t, int, const char *, int);
-extern void vmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc(). It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ * only a few 100MB are available. The kernel will handle it by spinning
+ * when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
*
- * All debugging is disabled. There will be no overhead even for
- * minimal memory accounting. To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely. When the total number of mapped allocations is large
+ * (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ * memory is used, it simply does not work on virtual memory. Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos. This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
*/
-#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
-#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
-#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
-#endif /* DEBUG_KMEM */
+#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
int spl_vmem_init(void);
void spl_vmem_fini(void);