Diffstat (limited to 'include')
-rw-r--r-- | include/sys/kmem.h       | 159
-rw-r--r-- | include/sys/kmem_cache.h |  24
-rw-r--r-- | include/sys/vmem.h       | 171
3 files changed, 95 insertions, 259 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index a9d94c909..045d07c2c 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -26,6 +26,7 @@
 #define _SPL_KMEM_H
 
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 extern int kmem_debugging(void);
 extern char *kmem_vasprintf(const char *fmt, va_list ap);
@@ -36,68 +37,41 @@ extern void strfree(char *str);
 /*
  * Memory allocation interfaces
  */
-#define KM_SLEEP        GFP_KERNEL      /* Can sleep, never fails */
-#define KM_NOSLEEP      GFP_ATOMIC      /* Can not sleep, may fail */
-#define KM_PUSHPAGE     (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
-#define KM_NODEBUG      __GFP_NOWARN    /* Suppress warnings */
-#define KM_FLAGS        __GFP_BITS_MASK
-#define KM_VMFLAGS      GFP_LEVEL_MASK
+#define KM_SLEEP        0x0000  /* can block for memory; success guaranteed */
+#define KM_NOSLEEP      0x0001  /* cannot block for memory; may fail */
+#define KM_PUSHPAGE     0x0004  /* can block for memory; may use reserve */
+#define KM_ZERO         0x1000  /* zero the allocation */
+#define KM_VMEM         0x2000  /* caller is vmem_* wrapper */
 
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-#define __GFP_ZERO      0x8000
-#endif
+#define KM_PUBLIC_MASK  (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
 
 /*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32.  To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart.  The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
  */
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
-        void *ptr;
-
-        do {
-                ptr = kmalloc(size, flags);
-        } while (ptr == NULL && (flags & __GFP_WAIT));
-
-        return (ptr);
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
 {
-        void *ptr;
-
-        do {
-                ptr = kzalloc(size, flags);
-        } while (ptr == NULL && (flags & __GFP_WAIT));
+        gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
 
-        return (ptr);
-}
+        if (flags & KM_NOSLEEP) {
+                lflags |= GFP_ATOMIC | __GFP_NORETRY;
+        } else {
+                lflags |= GFP_KERNEL;
+                if ((current->flags & PF_FSTRANS))
+                        lflags &= ~(__GFP_IO|__GFP_FS);
+        }
 
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
-        void *ptr;
+        if (flags & KM_PUSHPAGE)
+                lflags |= __GFP_HIGH;
 
-        do {
-                ptr = kmalloc_node(size, flags, node);
-        } while (ptr == NULL && (flags & __GFP_WAIT));
+        if (flags & KM_ZERO)
+                lflags |= __GFP_ZERO;
 
-        return (ptr);
+        return (lflags);
 }
 
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
 #ifdef HAVE_ATOMIC64_T
 #define kmem_alloc_used_add(size)       atomic64_add(size, &kmem_alloc_used)
 #define kmem_alloc_used_sub(size)       atomic64_sub(size, &kmem_alloc_used)
@@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used;
 extern unsigned long long kmem_alloc_max;
 #endif /* HAVE_ATOMIC64_T */
 
-#ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define kmem_alloc(sz, fl)              kmem_alloc_track((sz), (fl), \
-                                            __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl)             kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-                                            __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd)     kmem_alloc_track((sz), (fl), \
-                                            __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz)              kmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl)              kmem_alloc_debug((sz), (fl), \
-                                            __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl)             kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-                                            __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd)     kmem_alloc_debug((sz), (fl), \
-                                            __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz)              kmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl)              kmalloc_nofail((sz), (fl))
-#define kmem_zalloc(sz, fl)             kzalloc_nofail((sz), (fl))
-#define kmem_alloc_node(sz, fl, nd)     kmalloc_node_nofail((sz), (fl), (nd))
-#define kmem_free(ptr, sz)              ((void)(sz), kfree(ptr))
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
 
-#endif /* DEBUG_KMEM */
+#define kmem_alloc(sz, fl)      spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define kmem_zalloc(sz, fl)     spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define kmem_free(ptr, sz)      spl_kmem_free((ptr), (sz))
 
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
 
-#define kmem_virt(ptr)                  (((ptr) >= (void *)VMALLOC_START) && \
-                                        ((ptr) < (void *)VMALLOC_END))
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
 
 #endif /* _SPL_KMEM_H */
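The reworked kmem.h decouples the Solaris-style KM_* flags from the Linux GFP_* values and converts them at allocation time via kmem_flags_convert(). The sketch below shows how a caller is expected to use the resulting kmem_alloc()/kmem_zalloc()/kmem_free() interface; the my_ctx_t structure and the helper functions are invented for illustration, only the KM_* flags and kmem_* calls come from the header above.

/*
 * Illustration only: my_ctx_t and its fields are hypothetical, but the
 * kmem_zalloc()/kmem_alloc()/kmem_free() calls and KM_* flags match the
 * interfaces declared in the new include/sys/kmem.h.
 */
#include <sys/kmem.h>

typedef struct my_ctx {
        int     mc_state;
        char    *mc_buf;
} my_ctx_t;

static my_ctx_t *
my_ctx_create(size_t buflen)
{
        my_ctx_t *ctx;

        /* KM_SLEEP may block until memory is available and never fails. */
        ctx = kmem_zalloc(sizeof (my_ctx_t), KM_SLEEP);

        /* KM_NOSLEEP cannot block, so the caller must handle failure. */
        ctx->mc_buf = kmem_alloc(buflen, KM_NOSLEEP);
        if (ctx->mc_buf == NULL) {
                kmem_free(ctx, sizeof (my_ctx_t));
                return (NULL);
        }

        return (ctx);
}

static void
my_ctx_destroy(my_ctx_t *ctx, size_t buflen)
{
        /* The Solaris-style API requires the original size on free. */
        kmem_free(ctx->mc_buf, buflen);
        kmem_free(ctx, sizeof (my_ctx_t));
}

Because KM_SLEEP allocations are guaranteed to succeed, only the KM_NOSLEEP path needs a NULL check; the conversion to GFP_* flags, including the PF_FSTRANS handling, happens entirely inside the allocator.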
diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h
index a5bc0322b..a9b5bdd2f 100644
--- a/include/sys/kmem_cache.h
+++ b/include/sys/kmem_cache.h
@@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
 extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
 extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
 extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
 extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
 extern void spl_kmem_reap(void);
 
@@ -214,29 +215,6 @@ extern void spl_kmem_reap(void);
 #define kmem_cache_reap_now(skc)        \
         spl_kmem_cache_reap_now(skc, skc->skc_reap)
 #define kmem_reap()                     spl_kmem_reap()
-#define kmem_virt(ptr)                  \
-        (((ptr) >= (void *)VMALLOC_START) && \
-        ((ptr) < (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted.  This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
-{
-        if (skc->skc_linux_cache == NULL)
-                return;
-
-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
-        skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
-        skc->skc_linux_cache->gfpflags |= flags;
-#endif
-}
 
 /*
  * The following functions are only available for internal use.
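The inline kmem_cache_set_allocflags() helper removed above is replaced by the exported spl_kmem_cache_set_allocflags() declared in the first hunk. Below is a minimal sketch of how a consumer might use it; it assumes skc points to a slab-backed cache created elsewhere, the helper names are hypothetical, and the __GFP_COMP use case is taken from the removed comment.

/*
 * Sketch only: "skc" is assumed to have been created elsewhere with
 * kmem_cache_create() and to be backed by a Linux (KMC_SLAB) cache.
 */
#include <sys/kmem.h>
#include <sys/kmem_cache.h>

static void *
grab_object(spl_kmem_cache_t *skc)
{
        /*
         * Adjust the underlying Linux cache's allocation mask, e.g. to
         * add __GFP_COMP so higher order allocations are refcounted.
         */
        spl_kmem_cache_set_allocflags(skc, __GFP_COMP);

        /* Cache allocations take the same KM_* flags as kmem_alloc(). */
        return (spl_kmem_cache_alloc(skc, KM_SLEEP));
}

static void
drop_object(spl_kmem_cache_t *skc, void *obj)
{
        spl_kmem_cache_free(skc, obj);
}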
diff --git a/include/sys/vmem.h b/include/sys/vmem.h
index f59ac5e8b..6eb2c6769 100644
--- a/include/sys/vmem.h
+++ b/include/sys/vmem.h
@@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask);
 #define VMALLOC_TOTAL   (VMALLOC_END - VMALLOC_START)
 #endif
 
-static inline void *
-vmalloc_nofail(size_t size, gfp_t flags)
-{
-        void *ptr;
-
-        /*
-         * Retry failed __vmalloc() allocations once every second.  The
-         * rational for the delay is that the likely failure modes are:
-         *
-         * 1) The system has completely exhausted memory, in which case
-         * delaying 1 second for the memory reclaim to run is reasonable
-         * to avoid thrashing the system.
-         * 2) The system has memory but has exhausted the small virtual
-         * address space available on 32-bit systems.  Retrying the
-         * allocation immediately will only result in spinning on the
-         * virtual address space lock.  It is better delay a second and
-         * hope that another process will free some of the address space.
-         * But the bottom line is there is not much we can actually do
-         * since we can never safely return a failure and honor the
-         * Solaris semantics.
-         */
-        while (1) {
-                ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
-                if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
-                        set_current_state(TASK_INTERRUPTIBLE);
-                        schedule_timeout(HZ);
-                } else {
-                        break;
-                }
-        }
-
-        return (ptr);
-}
-
-static inline void *
-vzalloc_nofail(size_t size, gfp_t flags)
-{
-        void *ptr;
-
-        ptr = vmalloc_nofail(size, flags);
-        if (ptr)
-                memset(ptr, 0, (size));
-
-        return (ptr);
-}
-
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
-#ifdef HAVE_ATOMIC64_T
-
-#define vmem_alloc_used_add(size)       atomic64_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size)       atomic64_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read()          atomic64_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size)       atomic64_set(&vmem_alloc_used, size)
-
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#else /* HAVE_ATOMIC64_T */
-
-#define vmem_alloc_used_add(size)       atomic_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size)       atomic_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read()          atomic_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size)       atomic_set(&vmem_alloc_used, size)
-
-extern atomic_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#endif /* HAVE_ATOMIC64_T */
-
-#ifdef DEBUG_KMEM_TRACKING
 /*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space.  The kmem SLAB
+ * allocator allocates slabs from it.  Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
  *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define vmem_alloc(sz, fl)      vmem_alloc_track((sz), (fl), \
-                                    __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl)     vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-                                    __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz)      vmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-extern void *vmem_alloc_track(size_t, int, const char *, int);
-extern void vmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator.  The
+ * buddy allocator is not available to kernel modules, it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
  *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define vmem_alloc(sz, fl)      vmem_alloc_debug((sz), (fl), \
-                                    __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl)     vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-                                    __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz)      vmem_free_debug((ptr), (sz))
-
-extern void *vmem_alloc_debug(size_t, int, const char *, int);
-extern void vmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc().  It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator.  However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ * only a few 100MB are available. The kernel will handle it by spinning
+ * when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
  *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely.  When the total number of mapped allocations is large
+ * (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ * memory is used, it simply does not work on virtual memory.  Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos.  This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
  */
-#define vmem_alloc(sz, fl)      vmalloc_nofail((sz), (fl))
-#define vmem_zalloc(sz, fl)     vzalloc_nofail((sz), (fl))
-#define vmem_free(ptr, sz)      ((void)(sz), vfree(ptr))
-#endif /* DEBUG_KMEM */
+#define vmem_alloc(sz, fl)      spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl)     spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz)      spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
 
 int spl_vmem_init(void);
 void spl_vmem_fini(void);
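Since vmem_{alloc,zalloc,free}() are now thin wrappers that route to spl_vmem_*() rather than calling vmalloc() directly, callers keep using them exactly as before, with the same KM_* flags and the same size-on-free convention as kmem_*. A small illustrative sketch, with invented helper names and arbitrary sizing:

/*
 * Sketch only: make_table()/free_table() and the sizing are hypothetical;
 * the vmem_zalloc()/vmem_free() calls and KM_SLEEP flag come from the
 * headers in this change.
 */
#include <sys/kmem.h>
#include <sys/vmem.h>

#define MY_ROW_SIZE     128     /* arbitrary example sizing */

static char *
make_table(size_t rows)
{
        /* KM_SLEEP behaves as for kmem_*: it may block but will not fail. */
        return (vmem_zalloc(rows * MY_ROW_SIZE, KM_SLEEP));
}

static void
free_table(char *tbl, size_t rows)
{
        /* As with kmem_free(), the original allocation size is required. */
        vmem_free(tbl, rows * MY_ROW_SIZE);
}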