3 files changed, 95 insertions, 259 deletions
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index a9d94c909..045d07c2c 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -26,6 +26,7 @@
 #define	_SPL_KMEM_H
 
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 extern int kmem_debugging(void);
 extern char *kmem_vasprintf(const char *fmt, va_list ap);
@@ -36,68 +37,41 @@ extern void strfree(char *str);
 /*
  * Memory allocation interfaces
  */
-#define	KM_SLEEP	GFP_KERNEL	/* Can sleep, never fails */
-#define	KM_NOSLEEP	GFP_ATOMIC	/* Can not sleep, may fail */
-#define	KM_PUSHPAGE	(GFP_NOIO | __GFP_HIGH)	/* Use reserved memory */
-#define	KM_NODEBUG	__GFP_NOWARN	/* Suppress warnings */
-#define	KM_FLAGS	__GFP_BITS_MASK
-#define	KM_VMFLAGS	GFP_LEVEL_MASK
+#define	KM_SLEEP	0x0000	/* can block for memory; success guaranteed */
+#define	KM_NOSLEEP	0x0001	/* cannot block for memory; may fail */
+#define	KM_PUSHPAGE	0x0004	/* can block for memory; may use reserve */
+#define	KM_ZERO		0x1000	/* zero the allocation */
+#define	KM_VMEM		0x2000	/* caller is vmem_* wrapper */
 
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-#define	__GFP_ZERO	0x8000
-#endif
+#define	KM_PUBLIC_MASK	(KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
 
 /*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32.  To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart.  The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
  */
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	do {
-		ptr = kmalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
-
-	return (ptr);
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
 {
-	void *ptr;
-
-	do {
-		ptr = kzalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
+	gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
 
-	return (ptr);
-}
+	if (flags & KM_NOSLEEP) {
+		lflags |= GFP_ATOMIC | __GFP_NORETRY;
+	} else {
+		lflags |= GFP_KERNEL;
+		if ((current->flags & PF_FSTRANS))
+			lflags &= ~(__GFP_IO|__GFP_FS);
+	}
 
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
-	void *ptr;
+	if (flags & KM_PUSHPAGE)
+		lflags |= __GFP_HIGH;
 
-	do {
-		ptr = kmalloc_node(size, flags, node);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
 
-	return (ptr);
+	return (lflags);
 }
 
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
 #ifdef HAVE_ATOMIC64_T
 #define	kmem_alloc_used_add(size)	atomic64_add(size, &kmem_alloc_used)
 #define	kmem_alloc_used_sub(size)	atomic64_sub(size, &kmem_alloc_used)
@@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used;
 extern unsigned long long kmem_alloc_max;
 #endif /* HAVE_ATOMIC64_T */
 
-#ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define	kmem_alloc(sz, fl)		kmem_alloc_track((sz), (fl),           \
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_zalloc(sz, fl)		kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_alloc_node(sz, fl, nd)	kmem_alloc_track((sz), (fl),           \
-					__FUNCTION__, __LINE__, 1, nd)
-#define	kmem_free(ptr, sz)		kmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define	kmem_alloc(sz, fl)		kmem_alloc_debug((sz), (fl),           \
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_zalloc(sz, fl)		kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_alloc_node(sz, fl, nd)	kmem_alloc_debug((sz), (fl),           \
-					__FUNCTION__, __LINE__, 1, nd)
-#define	kmem_free(ptr, sz)		kmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-#define	kmem_alloc(sz, fl)		kmalloc_nofail((sz), (fl))
-#define	kmem_zalloc(sz, fl)		kzalloc_nofail((sz), (fl))
-#define	kmem_alloc_node(sz, fl, nd)	kmalloc_node_nofail((sz), (fl), (nd))
-#define	kmem_free(ptr, sz)		((void)(sz), kfree(ptr))
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
 
-#endif /* DEBUG_KMEM */
+#define	kmem_alloc(sz, fl)	spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define	kmem_zalloc(sz, fl)	spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define	kmem_free(ptr, sz)	spl_kmem_free((ptr), (sz))
 
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
 
-#define	kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
-					((ptr) <  (void *)VMALLOC_END))
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
 
 #endif	/* _SPL_KMEM_H */
diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h
index a5bc0322b..a9b5bdd2f 100644
--- a/include/sys/kmem_cache.h
+++ b/include/sys/kmem_cache.h
@@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
 extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
 extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
 extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
 extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
 extern void spl_kmem_reap(void);
 
@@ -214,29 +215,6 @@ extern void spl_kmem_reap(void);
 #define	kmem_cache_reap_now(skc)	\
     spl_kmem_cache_reap_now(skc, skc->skc_reap)
 #define	kmem_reap()			spl_kmem_reap()
-#define	kmem_virt(ptr)			\
-    (((ptr) >= (void *)VMALLOC_START) && \
-    ((ptr) <  (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted.  This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
-{
-	if (skc->skc_linux_cache == NULL)
-		return;
-
-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
-	skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
-	skc->skc_linux_cache->gfpflags |= flags;
-#endif
-}
 
 /*
  * The following functions are only available for internal use.
diff --git a/include/sys/vmem.h b/include/sys/vmem.h
index f59ac5e8b..6eb2c6769 100644
--- a/include/sys/vmem.h
+++ b/include/sys/vmem.h
@@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask);
 #define	VMALLOC_TOTAL	(VMALLOC_END - VMALLOC_START)
 #endif
 
-static inline void *
-vmalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	/*
-	 * Retry failed __vmalloc() allocations once every second.  The
-	 * rational for the delay is that the likely failure modes are:
-	 *
-	 * 1) The system has completely exhausted memory, in which case
-	 *    delaying 1 second for the memory reclaim to run is reasonable
-	 *    to avoid thrashing the system.
-	 * 2) The system has memory but has exhausted the small virtual
-	 *    address space available on 32-bit systems.  Retrying the
-	 *    allocation immediately will only result in spinning on the
-	 *    virtual address space lock.  It is better delay a second and
-	 *    hope that another process will free some of the address space.
-	 *    But the bottom line is there is not much we can actually do
-	 *    since we can never safely return a failure and honor the
-	 *    Solaris semantics.
-	 */
-	while (1) {
-		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
-		if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
-		} else {
-			break;
-		}
-	}
-
-	return (ptr);
-}
-
-static inline void *
-vzalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	ptr = vmalloc_nofail(size, flags);
-	if (ptr)
-		memset(ptr, 0, (size));
-
-	return (ptr);
-}
-
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
-#ifdef HAVE_ATOMIC64_T
-
-#define	vmem_alloc_used_add(size)	atomic64_add(size, &vmem_alloc_used)
-#define	vmem_alloc_used_sub(size)	atomic64_sub(size, &vmem_alloc_used)
-#define	vmem_alloc_used_read()		atomic64_read(&vmem_alloc_used)
-#define	vmem_alloc_used_set(size)	atomic64_set(&vmem_alloc_used, size)
-
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#else  /* HAVE_ATOMIC64_T */
-
-#define	vmem_alloc_used_add(size)	atomic_add(size, &vmem_alloc_used)
-#define	vmem_alloc_used_sub(size)	atomic_sub(size, &vmem_alloc_used)
-#define	vmem_alloc_used_read()		atomic_read(&vmem_alloc_used)
-#define	vmem_alloc_used_set(size)	atomic_set(&vmem_alloc_used, size)
-
-extern atomic_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#endif /* HAVE_ATOMIC64_T */
-
-#ifdef DEBUG_KMEM_TRACKING
 /*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
  *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define	vmem_alloc(sz, fl)		vmem_alloc_track((sz), (fl),           \
-					__FUNCTION__, __LINE__)
-#define	vmem_zalloc(sz, fl)		vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__)
-#define	vmem_free(ptr, sz)		vmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-extern void *vmem_alloc_track(size_t, int, const char *, int);
-extern void vmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules, it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
  *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define	vmem_alloc(sz, fl)		vmem_alloc_debug((sz), (fl),           \
-					__FUNCTION__, __LINE__)
-#define	vmem_zalloc(sz, fl)		vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__)
-#define	vmem_free(ptr, sz)		vmem_free_debug((ptr), (sz))
-
-extern void *vmem_alloc_debug(size_t, int, const char *, int);
-extern void vmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc().  It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ *    Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ *    vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ *    only a few 100MB are available. The kernel will handle it by spinning
+ *    when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ *    lock which serializes all allocations.
  *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ *    list. The former will sum the allocations while the latter will print
+ *    them to user space in a way that user space can keep the lock held
+ *    indefinitely.  When the total number of mapped allocations is large
+ *    (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ *    memory is used, it simply does not work on virtual memory.  Certain
+ *    Linux structures (e.g. the superblock) use them and might be embedded
+ *    into a structure from Illumos.  This makes using Linux virtual memory
+ *    unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
  */
-#define	vmem_alloc(sz, fl)		vmalloc_nofail((sz), (fl))
-#define	vmem_zalloc(sz, fl)		vzalloc_nofail((sz), (fl))
-#define	vmem_free(ptr, sz)		((void)(sz), vfree(ptr))
 
-#endif /* DEBUG_KMEM */
+#define	vmem_alloc(sz, fl)	spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define	vmem_zalloc(sz, fl)	spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define	vmem_free(ptr, sz)	spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
 
 int spl_vmem_init(void);
 void spl_vmem_fini(void);