| author | Brian Behlendorf <[email protected]> | 2014-12-08 15:37:14 -0500 |
|---|---|---|
| committer | Brian Behlendorf <[email protected]> | 2015-01-16 13:55:09 -0800 |
| commit | c3eabc75b1ea41a12e3fec06db74a2995bda7514 (patch) | |
| tree | 67443562b186dc0eff162ec2f4c17fc693cf3e0f /include | |
| parent | b34b95635a99223b6bff5437fb389e9340dc7dcd (diff) | |
Refactor generic memory allocation interfaces
This patch achieves the following goals:
1. It replaces the preprocessor-based mapping of kmem flags to GFP flags
   with proper translation logic (see the sketch after this list). This
   eliminates the potential for surprises that were previously possible
   when kmem flags were mapped directly to GFP flags.
2. It maps vmem_alloc() allocations to kmem_alloc() for allocations
sized less than or equal to the newly-added spl_kmem_alloc_max
parameter. This ensures that small allocations will not contend
on a single global lock, large allocations can still be handled,
and potentially limited virtual address space will not be squandered.
   This behavior is entirely different from that under Illumos due to the
   different memory management strategies employed by the respective
   kernels. However, it functionally provides the required semantics.
3. The --disable-debug-kmem, --enable-debug-kmem (default), and
   --enable-debug-kmem-tracking allocators have been unified into
a single spl_kmem_alloc_impl() allocation function. This was
done to simplify the code and make it more maintainable.
4. It improves portability by exposing an implementation of the memory
   allocation functions that can be safely used in the same way
they are used on Illumos. Specifically, callers may safely
use KM_SLEEP in contexts which perform filesystem IO. This
   allows us to eliminate an entire class of Linux-specific changes
which were previously required to avoid deadlocking the system.
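For reference, the translation logic described in goal 1 is a context-aware
helper; the sketch below is condensed from the new kmem_flags_convert()
added to include/sys/kmem.h by this patch (see the diff further down):

```c
#include <linux/slab.h>		/* gfp_t and the GFP_* / __GFP_* flags */
#include <linux/sched.h>	/* current, PF_FSTRANS */

#define KM_SLEEP	0x0000	/* can block for memory; success guaranteed */
#define KM_NOSLEEP	0x0001	/* cannot block for memory; may fail */
#define KM_PUSHPAGE	0x0004	/* can block for memory; may use reserve */
#define KM_ZERO		0x1000	/* zero the allocation */

/* Convert a KM_* flags mask to its Linux GFP_* counterpart. */
static inline gfp_t
kmem_flags_convert(int flags)
{
	gfp_t lflags = __GFP_NOWARN | __GFP_COMP;

	if (flags & KM_NOSLEEP) {
		lflags |= GFP_ATOMIC | __GFP_NORETRY;
	} else {
		lflags |= GFP_KERNEL;
		/* In a syncing context, avoid recursing into FS/IO paths. */
		if (current->flags & PF_FSTRANS)
			lflags &= ~(__GFP_IO | __GFP_FS);
	}

	if (flags & KM_PUSHPAGE)
		lflags |= __GFP_HIGH;	/* may dip into reserved memory */

	if (flags & KM_ZERO)
		lflags |= __GFP_ZERO;	/* zero the allocation */

	return (lflags);
}
```

Because the conversion inspects current->flags at call time, a KM_SLEEP
allocation issued while PF_FSTRANS is set automatically drops __GFP_IO and
__GFP_FS, which is what makes KM_SLEEP safe in contexts performing
filesystem IO (goal 4).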
This change will be largely transparent to existing callers but there
are a few caveats:
1. Because the headers were refactored and extraneous includes removed,
   callers may find they need to explicitly add additional #includes.
   In particular, kmem_cache.h must now be explicitly included to
access the SPL's kmem cache implementation. This behavior is
different from Illumos but it was done to avoid always masking
the Linux slab functions when kmem.h is included.
2. Callers, like Lustre, which made assumptions about the definitions
of KM_SLEEP, KM_NOSLEEP, and KM_PUSHPAGE will need to be updated.
   Other callers, such as ZFS, which made no such assumptions will not
   require changes.
3. KM_PUSHPAGE is no longer overloaded to imply GFP_NOIO. It retains
its original meaning of allowing allocations to access reserved
memory. KM_PUSHPAGE callers can be converted back to KM_SLEEP.
4. The KM_NODEBUG flag has been retired and the default warning
threshold increased to 32k.
5. The kmem_virt() function has been removed. Callers which need to
   distinguish between a physical and a virtual address should use
   is_vmalloc_addr() instead (a combined example follows this list).
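The following minimal sketch shows how an existing caller might be updated
for these caveats; example_buf_alloc()/example_buf_free() are hypothetical
names, while kmem_cache.h, KM_SLEEP, vmem_zalloc()/vmem_free(), and the
kernel's is_vmalloc_addr() are as described above:

```c
#include <linux/mm.h>		/* is_vmalloc_addr() */
#include <linux/printk.h>	/* pr_debug() */
#include <sys/kmem.h>
#include <sys/kmem_cache.h>	/* caveat 1: no longer pulled in by kmem.h */
#include <sys/vmem.h>

static void *
example_buf_alloc(size_t size)	/* hypothetical caller */
{
	/*
	 * Caveat 3: previously KM_PUSHPAGE to avoid deadlocks during
	 * filesystem IO; KM_SLEEP is now safe in those contexts and is
	 * guaranteed to succeed, so no NULL check is needed.
	 */
	return (vmem_zalloc(size, KM_SLEEP));
}

static void
example_buf_free(void *buf, size_t size)
{
	/* Caveat 5: was "if (kmem_virt(buf))". */
	if (is_vmalloc_addr(buf))
		pr_debug("freeing a virtually mapped %zu byte buffer\n", size);

	vmem_free(buf, size);
}
```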
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'include')
| -rw-r--r-- | include/sys/kmem.h | 159 |
| -rw-r--r-- | include/sys/kmem_cache.h | 24 |
| -rw-r--r-- | include/sys/vmem.h | 171 |
3 files changed, 95 insertions, 259 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h index a9d94c909..045d07c2c 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -26,6 +26,7 @@ #define _SPL_KMEM_H #include <linux/slab.h> +#include <linux/sched.h> extern int kmem_debugging(void); extern char *kmem_vasprintf(const char *fmt, va_list ap); @@ -36,68 +37,41 @@ extern void strfree(char *str); /* * Memory allocation interfaces */ -#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ -#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ -#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ -#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ -#define KM_FLAGS __GFP_BITS_MASK -#define KM_VMFLAGS GFP_LEVEL_MASK +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ -/* - * Used internally, the kernel does not need to support this flag - */ -#ifndef __GFP_ZERO -#define __GFP_ZERO 0x8000 -#endif +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) /* - * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as - * early as 2.6.32. To avoid this issue when it occurs in upstream kernels - * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC). - * I would prefer the caller handle the failure case cleanly but we are - * trying to emulate Solaris and those are not the Solaris semantics. + * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. */ -static inline void * -kmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - do { - ptr = kmalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); - - return (ptr); -} - -static inline void * -kzalloc_nofail(size_t size, gfp_t flags) +static inline gfp_t +kmem_flags_convert(int flags) { - void *ptr; - - do { - ptr = kzalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; - return (ptr); -} + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if ((current->flags & PF_FSTRANS)) + lflags &= ~(__GFP_IO|__GFP_FS); + } -static inline void * -kmalloc_node_nofail(size_t size, gfp_t flags, int node) -{ - void *ptr; + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; - do { - ptr = kmalloc_node(size, flags, node); - } while (ptr == NULL && (flags & __GFP_WAIT)); + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; - return (ptr); + return (lflags); } -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) @@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; #endif /* HAVE_ATOMIC64_T */ -#ifdef DEBUG_KMEM_TRACKING -/* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING - * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. 
This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) - -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) - -extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); -extern void kmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) -#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) -#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) -#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; -#endif /* DEBUG_KMEM */ +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) -int spl_kmem_init(void); -void spl_kmem_fini(void); +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +/* + * The following functions are only available for internal use. 
+ */ +extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); +extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); +extern void *spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node); +extern void spl_kmem_free_impl(const void *buf, size_t size); +extern void spl_kmem_free_debug(const void *buf, size_t size); +extern void spl_kmem_free_track(const void *buf, size_t size); + +extern int spl_kmem_init(void); +extern void spl_kmem_fini(void); #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h index a5bc0322b..a9b5bdd2f 100644 --- a/include/sys/kmem_cache.h +++ b/include/sys/kmem_cache.h @@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); extern void spl_kmem_reap(void); @@ -214,29 +215,6 @@ extern void spl_kmem_reap(void); #define kmem_cache_reap_now(skc) \ spl_kmem_cache_reap_now(skc, skc->skc_reap) #define kmem_reap() spl_kmem_reap() -#define kmem_virt(ptr) \ - (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) - -/* - * Allow custom slab allocation flags to be set for KMC_SLAB based caches. - * One use for this function is to ensure the __GFP_COMP flag is part of - * the default allocation mask which ensures higher order allocations are - * properly refcounted. This flag was added to the default ->allocflags - * as of Linux 3.11. - */ -static inline void -kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) -{ - if (skc->skc_linux_cache == NULL) - return; - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= flags; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= flags; -#endif -} /* * The following functions are only available for internal use. diff --git a/include/sys/vmem.h b/include/sys/vmem.h index f59ac5e8b..6eb2c6769 100644 --- a/include/sys/vmem.h +++ b/include/sys/vmem.h @@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask); #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) #endif -static inline void * -vmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - /* - * Retry failed __vmalloc() allocations once every second. The - * rational for the delay is that the likely failure modes are: - * - * 1) The system has completely exhausted memory, in which case - * delaying 1 second for the memory reclaim to run is reasonable - * to avoid thrashing the system. - * 2) The system has memory but has exhausted the small virtual - * address space available on 32-bit systems. Retrying the - * allocation immediately will only result in spinning on the - * virtual address space lock. It is better delay a second and - * hope that another process will free some of the address space. - * But the bottom line is there is not much we can actually do - * since we can never safely return a failure and honor the - * Solaris semantics. 
- */ - while (1) { - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } else { - break; - } - } - - return (ptr); -} - -static inline void * -vzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - ptr = vmalloc_nofail(size, flags); - if (ptr) - memset(ptr, 0, (size)); - - return (ptr); -} - -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ -#ifdef HAVE_ATOMIC64_T - -#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) - -extern atomic64_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#else /* HAVE_ATOMIC64_T */ - -#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) - -extern atomic_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#endif /* HAVE_ATOMIC64_T */ - -#ifdef DEBUG_KMEM_TRACKING /* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING + * vmem_* is an interface to a low level arena-based memory allocator on + * Illumos that is used to allocate virtual address space. The kmem SLAB + * allocator allocates slabs from it. Then the generic allocation functions + * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators. * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) - -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); -extern void *vmem_alloc_track(size_t, int, const char *, int); -extern void vmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * On Linux, the primary means of doing allocations is via kmalloc(), which + * is similarly layered on top of something called the buddy allocator. The + * buddy allocator is not available to kernel modules, it uses physical + * memory addresses rather than virtual memory addresses and is prone to + * fragmentation. * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. 
- */ -#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) - -extern void *vmem_alloc_debug(size_t, int, const char *, int); -extern void vmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * Linux sets aside a relatively small address space for in-kernel virtual + * memory from which allocations can be done using vmalloc(). It might seem + * like a good idea to use vmalloc() to implement something similar to + * Illumos' allocator. However, this has the following problems: + * + * 1. Page directory table allocations are hard coded to use GFP_KERNEL. + * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using + * vmalloc() will not have proper semantics. + * + * 2. Address space exhaustion is a real issue on 32-bit platforms where + * only a few 100MB are available. The kernel will handle it by spinning + * when it runs out of address space. + * + * 3. All vmalloc() allocations and frees are protected by a single global + * lock which serializes all allocations. * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. + * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire + * list. The former will sum the allocations while the latter will print + * them to user space in a way that user space can keep the lock held + * indefinitely. When the total number of mapped allocations is large + * (several 100,000) a large amount of time will be spent waiting on locks. + * + * 5. Linux has a wait_on_bit() locking primitive that assumes physical + * memory is used, it simply does not work on virtual memory. Certain + * Linux structures (e.g. the superblock) use them and might be embedded + * into a structure from Illumos. This makes using Linux virtual memory + * unsafe in certain situations. + * + * It follows that we cannot obtain identical semantics to those on Illumos. + * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in + * such a way that they can be used as drop-in replacements for small vmem_* + * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}() + * to them. */ -#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) -#endif /* DEBUG_KMEM */ +#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) +#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) +#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) + +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); void spl_vmem_fini(void); |