Diffstat (limited to 'module/spl/spl-kmem.c')
-rw-r--r--	module/spl/spl-kmem.c | 399
1 file changed, 244 insertions(+), 155 deletions(-)
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 96ad2b043..4cd7cdbee 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -23,8 +23,47 @@
  */
 
 #include <sys/debug.h>
+#include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/vmem.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages, since they must be physically contiguous.  Therefore,
+ * a rate-limited warning will be printed to the console for any
+ * kmem_alloc() which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to eight pages but capped at 32K to
+ * accommodate systems using large pages.  This value was selected to be
+ * small enough to ensure the largest allocations are quickly noticed and
+ * fixed, but large enough to avoid logging a warning when an allocation
+ * size is larger than optimal but not a serious concern.  Since this value
+ * is tunable, developers are encouraged to set it lower when testing so
+ * any new largish allocations are quickly caught.  These warnings may be
+ * disabled by setting the threshold to zero.
+ */
+unsigned int spl_kmem_alloc_warn = MIN(8 * PAGE_SIZE, 32 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+	"Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages.  Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
+ * will quickly fail.  Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding it.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+	"Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
 
 int
 kmem_debugging(void)
@@ -72,7 +111,7 @@ __strdup(const char *str, int flags)
 	int n;
 
 	n = strlen(str);
-	ptr = kmalloc_nofail(n + 1, flags);
+	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
 	if (ptr)
 		memcpy(ptr, str, n + 1);
 
@@ -94,10 +133,101 @@ strfree(char *str)
 EXPORT_SYMBOL(strfree);
 
 /*
- * Memory allocation interfaces and debugging for basic kmem_*
- * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
- * the SPL will keep track of the total memory allocated, and
- * report any memory leaked when the module is unloaded.
+ * Limit the number of large allocation stack traces dumped to not more than
+ * 5 every 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5);
+
+/*
+ * General purpose unified implementation of kmem_alloc().  It is an
+ * amalgamation of Linux and Illumos allocator design.  It should never be
+ * exported; code using kmem_alloc()/kmem_zalloc() must remain relatively
+ * portable, so consumers may only reach this function through wrappers
+ * that enforce the common flags.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	void *ptr;
+
+	/*
+	 * Log abnormally large allocations and rate limit the console output.
+	 * Allocations larger than spl_kmem_alloc_warn should be performed
+	 * through the vmem_alloc()/vmem_zalloc() interfaces.
+	 */
+	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+	    !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
+		printk(KERN_WARNING
+		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+		    "https://github.com/zfsonlinux/zfs/issues/new\n",
+		    (unsigned long)size, flags);
+		dump_stack();
+	}
+
+	/*
+	 * Use a loop because, unlike kmem_alloc() with KM_SLEEP on Illumos,
+	 * kmalloc_node() can fail even when GFP_KERNEL is used.
+	 */
+	do {
+		/*
+		 * Calling kmalloc_node() when the size exceeds
+		 * spl_kmem_alloc_max is unsafe.  This must fail for all
+		 * kmem_alloc() and kmem_zalloc() callers.
+		 *
+		 * For vmem_alloc() and vmem_zalloc() callers it is
+		 * permissible to fall back to __vmalloc().  However, in
+		 * general the use of __vmalloc() is discouraged because a
+		 * global lock must be acquired.  Contention on this lock can
+		 * significantly impact performance, so the virtual address
+		 * space should be manipulated only infrequently.
+		 */
+		if (unlikely(size > spl_kmem_alloc_max)) {
+			if (flags & KM_VMEM) {
+				ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+			} else {
+				return (NULL);
+			}
+		} else {
+			ptr = kmalloc_node(size, lflags, node);
+		}
+
+		if (likely(ptr) || (flags & KM_NOSLEEP))
+			return (ptr);
+
+		if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
+			printk(KERN_WARNING
+			    "Possible memory allocation deadlock: "
+			    "size=%lu lflags=0x%x\n",
+			    (unsigned long)size, lflags);
+			dump_stack();
+		}
+
+		/*
+		 * Use cond_resched() instead of congestion_wait() to avoid
+		 * deadlocking systems where there are no block devices.
+		 */
+		cond_resched();
+	} while (1);
+
+	return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+	if (is_vmalloc_addr(buf))
+		vfree(buf);
+	else
+		kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations.  When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
  */
 #ifdef DEBUG_KMEM
@@ -113,6 +243,28 @@ unsigned long long kmem_alloc_max = 0;
 EXPORT_SYMBOL(kmem_alloc_used);
 EXPORT_SYMBOL(kmem_alloc_max);
 
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+	void *ptr;
+
+	ptr = spl_kmem_alloc_impl(size, flags, node);
+	if (ptr) {
+		kmem_alloc_used_add(size);
+		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+			kmem_alloc_max = kmem_alloc_used_read();
+	}
+
+	return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+	kmem_alloc_used_sub(size);
+	spl_kmem_free_impl(ptr, size);
+}
+
 /*
  * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
  * but also the location of every alloc and free.  When the SPL module is
@@ -124,9 +276,14 @@ EXPORT_SYMBOL(kmem_alloc_max);
  * contended particularly on xfree().  If we want to run with this detailed
  * debugging enabled for anything other than debugging we need to minimize
  * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
  */
 #ifdef DEBUG_KMEM_TRACKING
 
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
 #define	KMEM_HASH_BITS		10
 #define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)
 
@@ -139,13 +296,9 @@ typedef struct kmem_debug {
 	int kd_line;			/* Allocation line */
 } kmem_debug_t;
 
-spinlock_t kmem_lock;
-struct hlist_head kmem_table[KMEM_TABLE_SIZE];
-struct list_head kmem_list;
-
-EXPORT_SYMBOL(kmem_lock);
-EXPORT_SYMBOL(kmem_table);
-EXPORT_SYMBOL(kmem_list);
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
 
 static kmem_debug_t *
 kmem_del_init(spinlock_t *lock, struct hlist_head *table,
@@ -174,176 +327,112 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table,
 	return (NULL);
 }
 
-void *
-kmem_alloc_track(size_t size, int flags, const char *func, int line,
-    int node_alloc, int node)
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node)
 {
 	void *ptr = NULL;
 	kmem_debug_t *dptr;
 	unsigned long irq_flags;
 
-	/* Function may be called with KM_NOSLEEP so failure is possible */
-	dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t),
-	    flags & ~__GFP_ZERO);
+	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+	if (dptr == NULL)
+		return (NULL);
 
-	if (unlikely(dptr == NULL)) {
-		printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d "
-		    "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags,
-		    func, line, kmem_alloc_used_read(), kmem_alloc_max);
-	} else {
-		/*
-		 * Marked unlikely because we should never be doing this,
-		 * we tolerate to up 2 pages but a single page is best.
-		 */
-		if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
-			printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) "
-			    "at %s:%d failed (%lld/%llu)\n",
-			    (unsigned long long)size, flags, func, line,
-			    kmem_alloc_used_read(), kmem_alloc_max);
-			spl_dumpstack();
-		}
-
-		/*
-		 * We use __strdup() below because the string pointed to by
-		 * __FUNCTION__ might not be available by the time we want
-		 * to print it since the module might have been unloaded.
-		 * This can only fail in the KM_NOSLEEP case.
-		 */
-		dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
-		if (unlikely(dptr->kd_func == NULL)) {
-			kfree(dptr);
-			printk(KERN_WARNING "debug __strdup() at %s:%d "
-			    "failed (%lld/%llu)\n", func, line,
-			    kmem_alloc_used_read(), kmem_alloc_max);
-			goto out;
-		}
-
-		/* Use the correct allocator */
-		if (node_alloc) {
-			ASSERT(!(flags & __GFP_ZERO));
-			ptr = kmalloc_node_nofail(size, flags, node);
-		} else if (flags & __GFP_ZERO) {
-			ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
-		} else {
-			ptr = kmalloc_nofail(size, flags);
-		}
+	dptr->kd_func = __strdup(func, flags);
+	if (dptr->kd_func == NULL) {
+		kfree(dptr);
+		return (NULL);
+	}
 
-		if (unlikely(ptr == NULL)) {
-			kfree(dptr->kd_func);
-			kfree(dptr);
-			printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) "
-			    "at %s:%d failed (%lld/%llu)\n",
-			    (unsigned long long) size, flags, func, line,
-			    kmem_alloc_used_read(), kmem_alloc_max);
-			goto out;
-		}
+	ptr = spl_kmem_alloc_debug(size, flags, node);
+	if (ptr == NULL) {
+		kfree(dptr->kd_func);
+		kfree(dptr);
+		return (NULL);
+	}
 
-		kmem_alloc_used_add(size);
-		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
-			kmem_alloc_max = kmem_alloc_used_read();
+	INIT_HLIST_NODE(&dptr->kd_hlist);
+	INIT_LIST_HEAD(&dptr->kd_list);
 
-		INIT_HLIST_NODE(&dptr->kd_hlist);
-		INIT_LIST_HEAD(&dptr->kd_list);
+	dptr->kd_addr = ptr;
+	dptr->kd_size = size;
+	dptr->kd_line = line;
 
-		dptr->kd_addr = ptr;
-		dptr->kd_size = size;
-		dptr->kd_line = line;
+	spin_lock_irqsave(&kmem_lock, irq_flags);
+	hlist_add_head(&dptr->kd_hlist,
+	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+	list_add_tail(&dptr->kd_list, &kmem_list);
+	spin_unlock_irqrestore(&kmem_lock, irq_flags);
 
-		spin_lock_irqsave(&kmem_lock, irq_flags);
-		hlist_add_head(&dptr->kd_hlist,
-		    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
-		list_add_tail(&dptr->kd_list, &kmem_list);
-		spin_unlock_irqrestore(&kmem_lock, irq_flags);
-	}
-out:
 	return (ptr);
 }
-EXPORT_SYMBOL(kmem_alloc_track);
 
-void
-kmem_free_track(const void *ptr, size_t size)
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
 {
 	kmem_debug_t *dptr;
 
-	ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
-	    (unsigned long long) size);
-
 	/* Must exist in hash due to kmem_alloc() */
 	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
-	ASSERT(dptr);
+	ASSERT3P(dptr, !=, NULL);
+	ASSERT3S(dptr->kd_size, ==, size);
 
-	/* Size must match */
-	ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
-	    "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
-	    (unsigned long long) size, dptr->kd_func, dptr->kd_line);
-
-	kmem_alloc_used_sub(size);
 	kfree(dptr->kd_func);
-
-	memset((void *)dptr, 0x5a, sizeof (kmem_debug_t));
 	kfree(dptr);
 
-	memset((void *)ptr, 0x5a, size);
-	kfree(ptr);
+	spl_kmem_free_debug(ptr, size);
 }
-EXPORT_SYMBOL(kmem_free_track);
-
-#else /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
 
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
 void *
-kmem_alloc_debug(size_t size, int flags, const char *func, int line,
-    int node_alloc, int node)
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
 {
-	void *ptr;
-
-	/*
-	 * Marked unlikely because we should never be doing this,
-	 * we tolerate to up 2 pages but a single page is best.
-	 */
-	if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
-		printk(KERN_WARNING
-		    "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
-		    (unsigned long long)size, flags, func, line,
-		    (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
-		spl_dumpstack();
-	}
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
 
-	/* Use the correct allocator */
-	if (node_alloc) {
-		ASSERT(!(flags & __GFP_ZERO));
-		ptr = kmalloc_node_nofail(size, flags, node);
-	} else if (flags & __GFP_ZERO) {
-		ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
-	} else {
-		ptr = kmalloc_nofail(size, flags);
-	}
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
 
-	if (unlikely(ptr == NULL)) {
-		printk(KERN_WARNING
-		    "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
-		    (unsigned long long)size, flags, func, line,
-		    (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
-	} else {
-		kmem_alloc_used_add(size);
-		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
-			kmem_alloc_max = kmem_alloc_used_read();
-	}
+	flags |= KM_ZERO;
 
-	return (ptr);
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
 }
-EXPORT_SYMBOL(kmem_alloc_debug);
+EXPORT_SYMBOL(spl_kmem_zalloc);
 
 void
-kmem_free_debug(const void *ptr, size_t size)
+spl_kmem_free(const void *buf, size_t size)
 {
-	ASSERT(ptr || size > 0);
-	kmem_alloc_used_sub(size);
-	kfree(ptr);
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_free_debug(buf, size));
+#else
+	return (spl_kmem_free_track(buf, size));
+#endif
 }
-EXPORT_SYMBOL(kmem_free_debug);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#endif /* DEBUG_KMEM */
+EXPORT_SYMBOL(spl_kmem_free);
 
 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
 static char *
@@ -424,22 +513,20 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
 	spin_unlock_irqrestore(lock, flags);
 }
 
-#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
-#define	spl_kmem_init_tracking(list, lock, size)
-#define	spl_kmem_fini_tracking(list, lock)
 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
 
 int
 spl_kmem_init(void)
 {
-	int rc = 0;
-
 #ifdef DEBUG_KMEM
 	kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
 	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
-#endif
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
 
-	return (rc);
+	return (0);
 }
 
 void
@@ -454,8 +541,10 @@ spl_kmem_fini(void)
 	 */
 	if (kmem_alloc_used_read() != 0)
 		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
-		    kmem_alloc_used_read(), kmem_alloc_max);
+		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
 
+#ifdef DEBUG_KMEM_TRACKING
 	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
 #endif /* DEBUG_KMEM */
 }
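Taken together, the new spl_kmem_alloc_warn and spl_kmem_alloc_max tunables give spl_kmem_alloc_impl() a simple size-based routing policy. The program below is a hypothetical userspace condensation of that decision tree, for illustration only; the KM_VMEM value and the stub constants are assumptions, not SPL definitions.

#include <stddef.h>
#include <stdio.h>

#define	PAGE_SIZE		4096UL
#define	KMALLOC_MAX_SIZE	(4UL << 20)	/* arch-dependent; 4M assumed */
#define	KM_VMEM			0x1000		/* value is illustrative only */

/* Inline MIN(), mirroring the patch's default of "8 pages capped at 32K". */
static unsigned long spl_kmem_alloc_warn =
	(8 * PAGE_SIZE < 32 * 1024) ? 8 * PAGE_SIZE : 32 * 1024;
static unsigned long spl_kmem_alloc_max = KMALLOC_MAX_SIZE >> 2;

/* Returns the allocator a request of 'size' bytes would reach. */
static const char *
route(size_t size, int flags)
{
	if (size > spl_kmem_alloc_max)
		return ((flags & KM_VMEM) ? "__vmalloc()" : "fail (NULL)");
	if (size > spl_kmem_alloc_warn && !(flags & KM_VMEM))
		return ("kmalloc_node() + console warning");
	return ("kmalloc_node()");
}

int
main(void)
{
	printf("%s\n", route(8 * 1024, 0));		/* kmalloc_node() */
	printf("%s\n", route(64 * 1024, 0));		/* warns, still kmalloc */
	printf("%s\n", route(2UL << 20, 0));		/* kmem_alloc() fails */
	printf("%s\n", route(2UL << 20, KM_VMEM));	/* falls back to __vmalloc() */
	return (0);
}

On a 4K-page system the defaults work out to 32K and KMALLOC_MAX_SIZE/4 respectively, and both thresholds can be changed at runtime through the 0644 module parameters.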
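The do/while retry loop in spl_kmem_alloc_impl() is what restores the Illumos guarantee that kmem_alloc(KM_SLEEP) never returns NULL, since on Linux kmalloc_node() may fail even with GFP_KERNEL. Here is a userspace analogue of the pattern, with sched_yield() standing in for cond_resched(); it is illustrative only, not SPL code.

#include <sched.h>
#include <stdlib.h>

/*
 * Retry until the underlying allocator succeeds, yielding the CPU
 * between attempts.  A NOSLEEP-style caller would instead return NULL
 * on the first failure, as the patch does via (flags & KM_NOSLEEP).
 */
static void *
alloc_sleep(size_t size)
{
	void *ptr;

	while ((ptr = malloc(size)) == NULL)
		sched_yield();	/* stand-in for cond_resched() */

	return (ptr);
}

Note that KM_NOSLEEP callers bypass the loop entirely through the early return, preserving fail-fast semantics for atomic contexts.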
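The console warnings lean on the stock Linux rate-limit helper from <linux/ratelimit.h>: DEFINE_RATELIMIT_STATE(name, interval, burst) declares the bookkeeping, and __ratelimit(&name) returns nonzero only while the call site remains within its burst budget for the current interval. Below is a minimal kernel-module sketch of the same 5-per-60-seconds policy; the module scaffolding is generic boilerplate, not taken from the patch.

#include <linux/module.h>
#include <linux/ratelimit.h>
#include <linux/printk.h>

/* At most 5 messages per 60 seconds, mirroring the patch. */
static DEFINE_RATELIMIT_STATE(demo_rs, 60 * HZ, 5);

static int __init demo_init(void)
{
	int i;

	for (i = 0; i < 100; i++) {
		/* Only the first 5 iterations in this minute will print. */
		if (__ratelimit(&demo_rs))
			pr_warn("noisy event %d\n", i);
	}

	return (0);
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Because the state is shared per declaration, the patch's single kmem_alloc_ratelimit_state throttles the large-allocation and possible-deadlock warnings against one common budget.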
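Under ./configure --enable-debug-kmem-tracking every live allocation is shadowed by a kmem_debug_t hashed on its address, which is how spl_kmem_free_track() can recover the recorded size and call site from nothing but the pointer. Below is a condensed userspace sketch of that bookkeeping; the hash function merely stands in for the kernel's hash_ptr(), locking is elided, and all names are illustrative.

#include <stdint.h>
#include <stdlib.h>

#define	TABLE_BITS	10
#define	TABLE_SIZE	(1u << TABLE_BITS)

typedef struct kd {
	struct kd	*kd_next;	/* hash chain link */
	void		*kd_addr;	/* tracked allocation */
	size_t		kd_size;	/* recorded size */
	int		kd_line;	/* allocation call site */
} kd_t;

static kd_t *table[TABLE_SIZE];

/* Crude multiplicative pointer hash standing in for hash_ptr(). */
static unsigned int
hash(const void *p)
{
	uint32_t x = (uint32_t)((uintptr_t)p >> 4);

	return ((x * 2654435761u) >> (32 - TABLE_BITS));
}

static void *
alloc_track(size_t size, int line)
{
	kd_t *kd;
	unsigned int h;

	if ((kd = malloc(sizeof (*kd))) == NULL)
		return (NULL);
	if ((kd->kd_addr = malloc(size)) == NULL) {
		free(kd);
		return (NULL);
	}

	kd->kd_size = size;
	kd->kd_line = line;

	/* Push onto the bucket keyed by the returned address. */
	h = hash(kd->kd_addr);
	kd->kd_next = table[h];
	table[h] = kd;

	return (kd->kd_addr);
}

static void
free_track(void *ptr)
{
	kd_t **kdp = &table[hash(ptr)];
	kd_t *kd;

	/* Must exist in the table, as spl_kmem_free_track() asserts. */
	while ((*kdp)->kd_addr != ptr)
		kdp = &(*kdp)->kd_next;

	kd = *kdp;
	*kdp = kd->kd_next;
	free(kd->kd_addr);
	free(kd);
}

The real implementation additionally keeps every kmem_debug_t on a global list so the module-unload leak report can walk all outstanding allocations in order.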