From c3eabc75b1ea41a12e3fec06db74a2995bda7514 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 Dec 2014 15:37:14 -0500 Subject: Refactor generic memory allocation interfaces This patch achieves the following goals: 1. It replaces the preprocessor kmem flag to gfp flag mapping with proper translation logic. This eliminates the potential for surprises that were previously possible where kmem flags were mapped to gfp flags. 2. It maps vmem_alloc() allocations to kmem_alloc() for allocations sized less than or equal to the newly-added spl_kmem_alloc_max parameter. This ensures that small allocations will not contend on a single global lock, large allocations can still be handled, and potentially limited virtual address space will not be squandered. This behavior is entirely different than under Illumos due to different memory management strategies employed by the respective kernels. However, this functionally provides the semantics required. 3. The --disable-debug-kmem, --enable-debug-kmem (default), and --enable-debug-kmem-tracking allocators have been unified into a single spl_kmem_alloc_impl() allocation function. This was done to simplify the code and make it more maintainable. 4. Improve portability by exposing an implementation of the memory allocation functions that can be safely used in the same way they are used on Illumos. Specifically, callers may safely use KM_SLEEP in contexts which perform filesystem IO. This allows us to eliminate an entire class of Linux-specific changes which were previously required to avoid deadlocking the system. This change will be largely transparent to existing callers but there are a few caveats: 1. Because the headers were refactored and extraneous includes removed, callers may find they need to explicitly add additional #includes. In particular, kmem_cache.h must now be explicitly included to access the SPL's kmem cache implementation. 
This behavior is different from Illumos but it was done to avoid always masking the Linux slab functions when kmem.h is included. 2. Callers, like Lustre, which made assumptions about the definitions of KM_SLEEP, KM_NOSLEEP, and KM_PUSHPAGE will need to be updated. Other callers such as ZFS which did not will not require changes. 3. KM_PUSHPAGE is no longer overloaded to imply GFP_NOIO. It retains its original meaning of allowing allocations to access reserved memory. KM_PUSHPAGE callers can be converted back to KM_SLEEP. 4. The KM_NODEBUG flag has been retired and the default warning threshold increased to 32k. 5. The kmem_virt() function has been removed. For callers which need to distinguish between a physical and virtual address use is_vmalloc_addr(). Signed-off-by: Brian Behlendorf --- module/spl/spl-kmem-cache.c | 89 +++++----- module/spl/spl-kmem.c | 399 +++++++++++++++++++++++++++----------------- module/spl/spl-proc.c | 20 --- module/spl/spl-tsd.c | 3 +- module/spl/spl-vmem.c | 319 ++++------------------------------- 5 files changed, 320 insertions(+), 510 deletions(-) (limited to 'module/spl') diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 3aa65a9bf..9a8ccfe42 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -130,19 +130,6 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. - * - * XXX: Improve the partial slab list by carefully maintaining a - * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees that when freeing - * slabs back to the system we need only linearly traverse the - * last N slabs in the list to discover all the freeable slabs. - * - * XXX: NUMA awareness for optionally allocating memory close to a - * particular core. 
This can be advantageous if you know the slab - * object will be short lived and primarily accessed from one core. - * - * XXX: Slab coloring may also yield performance improvements and would - * be desirable to implement. */ struct list_head spl_kmem_cache_list; /* List of caches */ @@ -158,15 +145,15 @@ SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { + gfp_t lflags = kmem_flags_convert(flags); void *ptr; ASSERT(ISP2(size)); if (skc->skc_flags & KMC_KMEM) - ptr = (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); else - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -361,12 +348,11 @@ spl_slab_free(spl_kmem_slab_t *sks, } /* - * Traverse all the partial slabs attached to a cache and free those - * which which are currently empty, and have not been touched for - * skc_delay seconds to avoid thrashing. The count argument is - * passed to optionally cap the number of slabs reclaimed, a count - * of zero means try and reclaim everything. When flag is set we - * always free an available slab regardless of age. + * Traverse all the partial slabs attached to a cache and free those which + * are currently empty, and have not been touched for skc_delay seconds to + * avoid thrashing. The count argument is passed to optionally cap the + * number of slabs reclaimed, a count of zero means try and reclaim + * everything. When the flag is set, available slabs are freed regardless of age. 
*/ static void spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) @@ -480,6 +466,7 @@ spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { + gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int empty; @@ -490,11 +477,11 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); - ske = kmalloc(sizeof (*ske), flags); + ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); - ske->ske_obj = kmalloc(skc->skc_obj_size, flags); + ske->ske_obj = kmalloc(skc->skc_obj_size, lflags); if (ske->ske_obj == NULL) { kfree(ske); return (-ENOMEM); @@ -734,7 +721,7 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; - skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; @@ -754,13 +741,9 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) static void spl_magazine_free(spl_kmem_magazine_t *skm) { - int size = sizeof (spl_kmem_magazine_t) + - sizeof (void *) * skm->skm_size; - ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); - - kmem_free(skm, size); + kfree(skm); } /* @@ -835,6 +818,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags) { + gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; @@ -852,18 +836,17 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * Allocate memory for a new cache and initialize it. Unfortunately, * this usually ends up being a large allocation of ~32k because * we need to allocate enough memory for the worst case number of - * cpus in the magazine, skc_mag[NR_CPUS]. 
Because of this we - * explicitly pass KM_NODEBUG to suppress the kmem warning + * cpus in the magazine, skc_mag[NR_CPUS]. */ - skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG); + skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { - kmem_free(skc, sizeof (*skc)); + kfree(skc); return (NULL); } strncpy(skc->skc_name, name, skc->skc_name_size); @@ -962,7 +945,11 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, goto out; } - kmem_cache_set_allocflags(skc, __GFP_COMP); +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif skc->skc_flags |= KMC_NOMAGAZINE; } @@ -977,8 +964,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, return (skc); out: - kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); @@ -1048,10 +1035,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); - kmem_free(skc->skc_name, skc->skc_name_size); spin_unlock(&skc->skc_lock); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); @@ -1106,7 +1093,13 @@ spl_cache_grow_work(void *data) spl_kmem_cache_t *skc = ska->ska_cache; spl_kmem_slab_t *sks; - sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); +#if defined(PF_MEMALLOC_NOIO) + unsigned noio_flag = memalloc_noio_save(); + sks = spl_slab_alloc(skc, ska->ska_flags); + memalloc_noio_restore(noio_flag); +#else + sks = spl_slab_alloc(skc, ska->ska_flags); +#endif 
spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; @@ -1140,8 +1133,9 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc) static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { - int remaining, rc; + int remaining, rc = 0; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); @@ -1166,7 +1160,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; - ska = kmalloc(sizeof (*ska), flags); + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit(KMC_BIT_GROWING, &skc->skc_flags); wake_up_all(&skc->skc_waitq); @@ -1175,7 +1169,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) atomic_inc(&skc->skc_ref); ska->ska_cache = skc; - ska->ska_flags = flags & ~__GFP_FS; + ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); @@ -1347,9 +1341,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) spl_kmem_magazine_t *skm; void *obj = NULL; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - ASSERT(flags & KM_SLEEP); atomic_inc(&skc->skc_ref); @@ -1360,9 +1354,8 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; - do { - obj = kmem_cache_alloc(slc, flags | __GFP_COMP); + obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); goto ret; @@ -1445,7 +1438,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) * are guaranteed to have physical addresses. They must be removed * from the tree of emergency objects and the freed. 
*/ - if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { + if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) { spl_emergency_free(skc, obj); goto out; } diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 96ad2b043..4cd7cdbee 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -23,8 +23,47 @@ */ #include +#include #include #include +#include +#include + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must be physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to eight pages but capped at 32K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when an allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +unsigned int spl_kmem_alloc_warn = MAX(8 * PAGE_SIZE, 32 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safety + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. 
Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); int kmem_debugging(void) @@ -72,7 +111,7 @@ __strdup(const char *str, int flags) int n; n = strlen(str); - ptr = kmalloc_nofail(n + 1, flags); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); if (ptr) memcpy(ptr, str, n + 1); @@ -94,10 +133,101 @@ strfree(char *str) EXPORT_SYMBOL(strfree); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. + */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. 
+ */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if (unlikely(size > spl_kmem_alloc_max)) { + if (flags & KM_VMEM) { + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) { + printk(KERN_WARNING + "Possible memory allocation deadlock: " + "size=%lu lflags=0x%x", + (unsigned long)size, lflags); + dump_stack(); + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. 
+ * + * ./configure --enable-debug-kmem */ #ifdef DEBUG_KMEM @@ -113,6 +243,28 @@ unsigned long long kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + /* * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is @@ -124,9 +276,14 @@ EXPORT_SYMBOL(kmem_alloc_max); * contended particularly on xfree(). If we want to run with this detailed * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. 
+ * + * ./configure --enable-debug-kmem-tracking */ #ifdef DEBUG_KMEM_TRACKING +#include +#include + #define KMEM_HASH_BITS 10 #define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) @@ -139,13 +296,9 @@ typedef struct kmem_debug { int kd_line; /* Allocation line */ } kmem_debug_t; -spinlock_t kmem_lock; -struct hlist_head kmem_table[KMEM_TABLE_SIZE]; -struct list_head kmem_list; - -EXPORT_SYMBOL(kmem_lock); -EXPORT_SYMBOL(kmem_table); -EXPORT_SYMBOL(kmem_list); +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; static kmem_debug_t * kmem_del_init(spinlock_t *lock, struct hlist_head *table, @@ -174,176 +327,112 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, return (NULL); } -void * -kmem_alloc_track(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) { void *ptr = NULL; kmem_debug_t *dptr; unsigned long irq_flags; - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d " - "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags, - func, line, kmem_alloc_used_read(), kmem_alloc_max); - } else { - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. 
- */ - if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it since the module might have been unloaded. - * This can only fail in the KM_NOSLEEP case. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = kmalloc_nofail(size, flags); - } + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; + spin_lock_irqsave(&kmem_lock, irq_flags); + 
hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); - spin_lock_irqsave(&kmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &kmem_list); - spin_unlock_irqrestore(&kmem_lock, irq_flags); - } -out: return (ptr); } -EXPORT_SYMBOL(kmem_alloc_track); -void -kmem_free_track(const void *ptr, size_t size) +inline void +spl_kmem_free_track(const void *ptr, size_t size) { kmem_debug_t *dptr; - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - /* Must exist in hash due to kmem_alloc() */ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); - ASSERT(dptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - kmem_alloc_used_sub(size); kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); kfree(dptr); - memset((void *)ptr, 0x5a, size); - kfree(ptr); + spl_kmem_free_debug(ptr, size); } -EXPORT_SYMBOL(kmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ void * -kmem_alloc_debug(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +spl_kmem_alloc(size_t size, int flags, const char *func, int line) { - void *ptr; - - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. 
- */ - if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING - "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = kmalloc_nofail(size, flags); - } +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - } else { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - } + flags |= KM_ZERO; - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(kmem_alloc_debug); +EXPORT_SYMBOL(spl_kmem_zalloc); void -kmem_free_debug(const void *ptr, size_t size) +spl_kmem_free(const void *buf, size_t size) { - ASSERT(ptr || size > 0); - kmem_alloc_used_sub(size); - kfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif 
!defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -EXPORT_SYMBOL(kmem_free_debug); - -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ +EXPORT_SYMBOL(spl_kmem_free); #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) static char * @@ -424,22 +513,20 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) spin_unlock_irqrestore(lock, flags); } -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int spl_kmem_init(void) { - int rc = 0; - #ifdef DEBUG_KMEM kmem_alloc_used_set(0); + +#ifdef DEBUG_KMEM_TRACKING spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); -#endif +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ - return (rc); + return (0); } void @@ -454,8 +541,10 @@ spl_kmem_fini(void) */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - kmem_alloc_used_read(), kmem_alloc_max); + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); +#ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ } diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index e5712aee0..a434ef54f 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -353,26 +353,6 @@ static struct ctl_table spl_kmem_table[] = { .mode = 0444, .proc_handler = &proc_doulongvec_minmax, }, - { - .procname = "vmem_used", - .data = &vmem_alloc_used, -# ifdef HAVE_ATOMIC64_T - .maxlen = sizeof(atomic64_t), -# else - .maxlen = sizeof(atomic_t), -# endif /* HAVE_ATOMIC64_T */ - .mode = 0444, - .proc_handler = &proc_domemused, - }, - { - .procname = "vmem_max", - .data = &vmem_alloc_max, - .maxlen = sizeof(unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = 
&proc_doulongvec_minmax, - }, { .procname = "slab_kmem_total", .data = (void *)(KMC_KMEM | KMC_TOTAL), diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c index f4f81048c..9a0987527 100644 --- a/module/spl/spl-tsd.c +++ b/module/spl/spl-tsd.c @@ -337,8 +337,7 @@ tsd_hash_table_init(uint_t bits) if (table == NULL) return (NULL); - table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, - KM_SLEEP | KM_NODEBUG); + table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP); if (table->ht_bins == NULL) { kmem_free(table, sizeof(tsd_hash_table_t)); return (NULL); diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c index 51aef941b..e177988a7 100644 --- a/module/spl/spl-vmem.c +++ b/module/spl/spl-vmem.c @@ -24,6 +24,7 @@ #include #include +#include #include vmem_t *heap_arena = NULL; @@ -47,314 +48,62 @@ vmem_size(vmem_t *vmp, int typemask) EXPORT_SYMBOL(vmem_size); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. */ -#ifdef DEBUG_KMEM - -/* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T -atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long vmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t vmem_alloc_used = ATOMIC_INIT(0); -unsigned long long vmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ - -EXPORT_SYMBOL(vmem_alloc_used); -EXPORT_SYMBOL(vmem_alloc_max); - -/* - * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked - * but also the location of every alloc and free. When the SPL module is - * unloaded a list of all leaked addresses and where they were allocated - * will be dumped to the console. Enabling this feature has a significant - * impact on performance but it makes finding memory leaks straight forward. 
- * - * Not surprisingly with debugging enabled the xmem_locks are very highly - * contended particularly on xfree(). If we want to run with this detailed - * debugging enabled for anything other than debugging we need to minimize - * the contention by moving to a lock per xmem_table entry model. - */ -#ifdef DEBUG_KMEM_TRACKING - -#define VMEM_HASH_BITS 10 -#define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) - -typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ -} kmem_debug_t; - -spinlock_t vmem_lock; -struct hlist_head vmem_table[VMEM_TABLE_SIZE]; -struct list_head vmem_list; - -EXPORT_SYMBOL(vmem_lock); -EXPORT_SYMBOL(vmem_table); -EXPORT_SYMBOL(vmem_list); - void * -vmem_alloc_track(size_t size, int flags, const char *func, int line) -{ - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; - - ASSERT(flags & KM_SLEEP); - - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - sizeof (kmem_debug_t), flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - } else { - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it, since the module might have been unloaded. - * This can never fail because we have already asserted - * that flags is KM_SLEEP. 
- */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&vmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &vmem_list); - spin_unlock_irqrestore(&vmem_lock, irq_flags); - } -out: - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_track); - -void -vmem_free_track(const void *ptr, size_t size) +spl_vmem_alloc(size_t size, int flags, const char *func, int line) { - kmem_debug_t *dptr; + ASSERT0(flags & ~KM_PUBLIC_MASK); - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); + flags |= KM_VMEM; - /* Must exist in hash due to vmem_alloc() */ - dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); - ASSERT(dptr); - - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - vmem_alloc_used_sub(size); - kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof 
(kmem_debug_t)); - kfree(dptr); - - memset((void *)ptr, 0x5a, size); - vfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_vmem_alloc); void * -vmem_alloc_debug(size_t size, int flags, const char *func, int line) +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) { - void *ptr; - - ASSERT(flags & KM_SLEEP); + ASSERT0(flags & ~KM_PUBLIC_MASK); - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = vmalloc_nofail(size, flags); - } + flags |= (KM_VMEM | KM_ZERO); - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); - } else { - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - } - - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_alloc_debug); +EXPORT_SYMBOL(spl_vmem_zalloc); void -vmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - vmem_alloc_used_sub(size); - vfree(ptr); -} -EXPORT_SYMBOL(vmem_free_debug); - -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < 
kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* - * Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. - */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* - * Minimum number of printable characters found - * to make it worthwhile to print this as ascii. - */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return (str); -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +spl_vmem_free(const void *buf, size_t size) { - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(lock, flags); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, 
size) -#define spl_kmem_fini_tracking(list, lock) -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_vmem_free); int spl_vmem_init(void) { - int rc = 0; - -#ifdef DEBUG_KMEM - vmem_alloc_used_set(0); - spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); -#endif - - return (rc); + return (0); } void spl_vmem_fini(void) { -#ifdef DEBUG_KMEM - /* - * Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. - */ - if (vmem_alloc_used_read() != 0) - printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", - vmem_alloc_used_read(), vmem_alloc_max); - - spl_kmem_fini_tracking(&vmem_list, &vmem_lock); -#endif /* DEBUG_KMEM */ } -- cgit v1.2.3