aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys/kmem.h
blob: e90c6b8ceb2ce01686c74767d5c0b302b6fc210a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/*****************************************************************************\
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *  UCRL-CODE-235197
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *  For details, see <http://github.com/behlendorf/spl/>.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
\*****************************************************************************/

#ifndef _SPL_KMEM_H
#define	_SPL_KMEM_H

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm_compat.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
#include <sys/vmsystm.h>
#include <sys/kstat.h>

/*
 * Memory allocation interfaces
 *
 * Solaris-style KM_* allocation flags expressed directly as Linux GFP
 * flags, so they can be handed unchanged to the allocation helpers and
 * the kmem_*()/vmem_*() macros defined later in this header.
 */
/* Sleeping allocation; mapped to GFP_NOFS. */
#define KM_SLEEP                        GFP_NOFS
/* Non-sleeping allocation; mapped to GFP_ATOMIC. */
#define KM_NOSLEEP                      GFP_ATOMIC
#undef  KM_PANIC                        /* No linux analog */
/* KM_SLEEP with __GFP_HIGH additionally set. */
#define KM_PUSHPAGE                     (KM_SLEEP | __GFP_HIGH)
/* Flag masks, taken from the kernel's own GFP mask definitions. */
#define KM_VMFLAGS                      GFP_LEVEL_MASK
#define KM_FLAGS                        __GFP_BITS_MASK
/* Mapped to __GFP_NOWARN. */
#define KM_NODEBUG                      __GFP_NOWARN

/*
 * Fallback definition of __GFP_ZERO for kernels whose headers do not
 * provide it.  The flag is only used internally: the debug variants of
 * kmem_zalloc()/vmem_zalloc() below OR it into the flags they pass to the
 * SPL allocation functions, so the kernel itself does not need to
 * support it.
 */
#ifndef __GFP_ZERO
# define __GFP_ZERO                     0x8000
#endif

/*
 * __GFP_NOFAIL looks like it will be removed from the kernel, perhaps as
 * early as 2.6.32.  To be unaffected when that happens upstream, the
 * *_nofail() helpers implement the retry themselves: a failed allocation
 * is repeated for as long as the caller passed __GFP_WAIT.  Allocations
 * made without __GFP_WAIT (e.g. GFP_ATOMIC) are attempted exactly once
 * and may return NULL.  It would be preferable for callers to handle
 * failure cleanly, but we are emulating Solaris and those are not the
 * Solaris semantics.
 *
 * kmalloc_nofail() wraps kmalloc() with that policy.
 */
static inline void *
kmalloc_nofail(size_t size, gfp_t flags)
{
	void *buf = kmalloc(size, flags);

	/* Only sleeping allocations are retried until they succeed. */
	while (buf == NULL && (flags & __GFP_WAIT))
		buf = kmalloc(size, flags);

	return buf;
}

/*
 * kzalloc() wrapper that keeps retrying a failed allocation while the
 * caller passed __GFP_WAIT.  Without __GFP_WAIT the allocation is
 * attempted once and NULL may be returned.
 */
static inline void *
kzalloc_nofail(size_t size, gfp_t flags)
{
	void *buf = NULL;

	while (buf == NULL) {
		buf = kzalloc(size, flags);

		/* Non-sleeping callers get a single attempt. */
		if (!(flags & __GFP_WAIT))
			break;
	}

	return buf;
}

/*
 * Node-aware variant of kmalloc_nofail().  When the kernel provides
 * kmalloc_node() (HAVE_KMALLOC_NODE) the allocation is requested from
 * 'node' and retried while it fails and __GFP_WAIT is set; otherwise the
 * node hint is ignored and kmalloc_nofail() is used.
 */
static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node)
{
#ifdef HAVE_KMALLOC_NODE
	void *buf;

	for (;;) {
		buf = kmalloc_node(size, flags, node);

		/* Done on success, or when the caller may not sleep. */
		if (buf != NULL || !(flags & __GFP_WAIT))
			return buf;
	}
#else
	return kmalloc_nofail(size, flags);
#endif /* HAVE_KMALLOC_NODE */
}

/*
 * __vmalloc() wrapper that does not fail for sleeping allocations.
 *
 * size  - number of bytes to allocate
 * flags - GFP flags; __GFP_HIGHMEM is always added
 *
 * Returns the allocated address.  NULL can only be returned when the
 * caller did not pass __GFP_WAIT, in which case a single attempt is made.
 */
static inline void *
vmalloc_nofail(size_t size, gfp_t flags)
{
	void *ptr;

	/*
	 * Retry failed __vmalloc() allocations once every second.  The
	 * rationale for the delay is that the likely failure modes are:
	 *
	 * 1) The system has completely exhausted memory, in which case
	 *    delaying 1 second for the memory reclaim to run is reasonable
	 *    to avoid thrashing the system.
	 * 2) The system has memory but has exhausted the small virtual
	 *    address space available on 32-bit systems.  Retrying the
	 *    allocation immediately will only result in spinning on the
	 *    virtual address space lock.  It is better to delay a second
	 *    and hope that another process will free some of the address
	 *    space.  But the bottom line is there is not much we can
	 *    actually do since we can never safely return a failure and
	 *    honor the Solaris semantics.
	 *
	 * The sleep must be uninterruptible.  With TASK_INTERRUPTIBLE a
	 * pending signal makes schedule_timeout() return immediately, and
	 * since we cannot bail out on a signal anyway the loop would
	 * degenerate into exactly the immediate-retry spin described above.
	 */
	while (1) {
		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
		if (!unlikely((ptr == NULL) && (flags & __GFP_WAIT)))
			break;

		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ);
	}

	return ptr;
}

/*
 * Zero-filled variant of vmalloc_nofail(): allocate 'size' bytes and
 * clear them.  NULL is passed through untouched when the underlying
 * allocation returned it.
 */
static inline void *
vzalloc_nofail(size_t size, gfp_t flags)
{
	void *buf = vmalloc_nofail(size, flags);

	if (buf == NULL)
		return NULL;

	memset(buf, 0, size);

	return buf;
}

#ifdef DEBUG_KMEM

/*
 * Memory accounting functions to be used only when DEBUG_KMEM is set.
 *
 * The kmem_alloc_used/vmem_alloc_used counters are accessed only through
 * the *_add/_sub/_read/_set wrappers below so the same call sites work
 * whether the counters are atomic64_t (HAVE_ATOMIC64_T) or plain
 * atomic_t.  Note the wrappers take the value as their only argument;
 * the address of the counter is supplied by the macro.
 *
 * kmem_alloc_max/vmem_alloc_max are declared here but not touched by
 * these macros (presumably high-water marks maintained by the
 * implementation -- confirm in the .c file).
 */
# ifdef HAVE_ATOMIC64_T

/* 64-bit atomic counters. */
# define kmem_alloc_used_add(size)      atomic64_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size)      atomic64_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read()         atomic64_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size)      atomic64_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size)      atomic64_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size)      atomic64_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read()         atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size)      atomic64_set(&vmem_alloc_used, size)

extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# else  /* HAVE_ATOMIC64_T */

/*
 * Fallback to atomic_t counters.
 * NOTE(review): atomic_t is narrower than the unsigned long long maxima
 * declared alongside it, so large totals may wrap -- confirm whether
 * that matters for the supported platforms.
 */
# define kmem_alloc_used_add(size)      atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size)      atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read()         atomic_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size)      atomic_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size)      atomic_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size)      atomic_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read()         atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size)      atomic_set(&vmem_alloc_used, size)

extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# endif /* HAVE_ATOMIC64_T */

# ifdef DEBUG_KMEM_TRACKING
/*
 * DEBUG_KMEM && DEBUG_KMEM_TRACKING
 *
 * The maximum level of memory debugging.  All memory will be accounted
 * for and each allocation will be explicitly tracked.  Any allocation
 * which is leaked will be reported on module unload and the exact location
 * where that memory was allocated will be reported.  This level of memory
 * tracking will have a significant impact on performance and should only
 * be enabled for debugging.  This feature may be enabled by passing
 * --enable-debug-kmem-tracking to configure.
 *
 * Every allocation macro records the calling function and line via
 * __FUNCTION__/__LINE__.  The zeroing variants request zero-fill by
 * OR-ing __GFP_ZERO into the flags.  The last two kmem_alloc_track()
 * arguments are 0, 0 for ordinary allocations and 1, nd for
 * kmem_alloc_node() (presumably "node allocation" and the node id --
 * confirm against the implementation).
 */
#  define kmem_alloc(sz, fl)            kmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)           kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 1, nd)
#  define kmem_free(ptr, sz)            kmem_free_track((ptr), (sz))

#  define vmem_alloc(sz, fl)            vmem_alloc_track((sz), (fl),           \
                                             __FUNCTION__, __LINE__)
#  define vmem_zalloc(sz, fl)           vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__)
#  define vmem_free(ptr, sz)            vmem_free_track((ptr), (sz))

/* Tracking implementations; defined outside this header. */
extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
extern void kmem_free_track(void *, size_t);
extern void *vmem_alloc_track(size_t, int, const char *, int);
extern void vmem_free_track(void *, size_t);

# else /* DEBUG_KMEM_TRACKING */
/*
 * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * The default build will set DEBUG_KMEM.  This provides basic memory
 * accounting with little to no impact on performance.  When the module
 * is unloaded, if any memory was leaked the total number of leaked bytes
 * will be reported on the console.  To disable this basic accounting
 * pass the --disable-debug-kmem option to configure.
 *
 * The macros mirror the tracking set above, argument for argument, but
 * call the *_debug() implementations instead.
 */
#  define kmem_alloc(sz, fl)            kmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_zalloc(sz, fl)           kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__, 0, 0)
#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__, 1, nd)
#  define kmem_free(ptr, sz)            kmem_free_debug((ptr), (sz))

#  define vmem_alloc(sz, fl)            vmem_alloc_debug((sz), (fl),           \
                                             __FUNCTION__, __LINE__)
#  define vmem_zalloc(sz, fl)           vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
                                             __FUNCTION__, __LINE__)
#  define vmem_free(ptr, sz)            vmem_free_debug((ptr), (sz))

/* Accounting implementations; defined outside this header. */
extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
extern void kmem_free_debug(void *, size_t);
extern void *vmem_alloc_debug(size_t, int, const char *, int);
extern void vmem_free_debug(void *, size_t);

# endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */
/*
 * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
 *
 * All debugging is disabled.  There will be no overhead even for
 * minimal memory accounting.  To enable basic accounting pass the
 * --enable-debug-kmem option to configure.
 *
 * The allocation macros map straight onto the *_nofail() inline helpers
 * defined earlier in this header.  The free macros accept the Solaris
 * style size argument for source compatibility; it is evaluated and
 * then discarded via the (void) cast before kfree()/vfree() is called.
 */
# define kmem_alloc(sz, fl)             kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl)            kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd)    kmalloc_node_nofail((sz), (fl), (nd))
# define kmem_free(ptr, sz)             ((void)(sz), kfree(ptr))

# define vmem_alloc(sz, fl)             vmalloc_nofail((sz), (fl))
# define vmem_zalloc(sz, fl)            vzalloc_nofail((sz), (fl))
# define vmem_free(ptr, sz)             ((void)(sz), vfree(ptr))

#endif /* DEBUG_KMEM */

/*
 * Miscellaneous helpers implemented outside this header.
 * NOTE(review): the string-returning functions presumably allocate
 * their result and expect it to be released with strfree() -- confirm
 * against the implementation.
 */
extern int kmem_debugging(void);
/* printf-style formatting into a returned string (va_list form). */
extern char *kmem_vasprintf(const char *fmt, va_list ap);
/* printf-style formatting into a returned string (variadic form). */
extern char *kmem_asprintf(const char *fmt, ...);
extern char *strdup(const char *str);
extern void strfree(char *str);


/*
 * Slab allocation interfaces.  The SPL slab differs from the standard
 * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
 * allocated from the physical or virtual memory address space.  The
 * virtual slabs allow for good behavior when allocating large objects of
 * identical size.  This slab implementation also supports both
 * constructors and destructors which the Linux slab does not.
 *
 * The enum below names flag bit positions; the matching KMC_* masks are
 * derived from them with (1 << KMC_BIT_*).  Bits 0-7 describe cache
 * properties, bits 16-17 mark an operation in progress.
 */
enum {
	KMC_BIT_NOTOUCH		= 0,	/* Don't update ages */
	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
	KMC_BIT_KMEM		= 5,	/* Use kmem cache */
	KMC_BIT_VMEM		= 6,	/* Use vmem cache */
	KMC_BIT_OFFSLAB		= 7,	/* Objects not on slab */
	KMC_BIT_REAPING		= 16,	/* Reaping in progress */
	KMC_BIT_DESTROY		= 17,	/* Destroy in progress */
};

/* Flag masks, one per KMC_BIT_* bit position. */
#define KMC_NOTOUCH		(1 << KMC_BIT_NOTOUCH)
#define KMC_NODEBUG		(1 << KMC_BIT_NODEBUG)
#define KMC_NOMAGAZINE		(1 << KMC_BIT_NOMAGAZINE)
#define KMC_NOHASH		(1 << KMC_BIT_NOHASH)
#define KMC_QCACHE		(1 << KMC_BIT_QCACHE)
#define KMC_KMEM		(1 << KMC_BIT_KMEM)
#define KMC_VMEM		(1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB		(1 << KMC_BIT_OFFSLAB)
#define KMC_REAPING		(1 << KMC_BIT_REAPING)
#define KMC_DESTROY		(1 << KMC_BIT_DESTROY)

/*
 * Reclaim defaults.
 * NOTE(review): how these two values are consumed is not visible in this
 * header -- see the slab implementation.
 */
#define KMC_REAP_CHUNK			INT_MAX
#define KMC_DEFAULT_SEEKS		1

/*
 * Global cache list and the rw-semaphore declared alongside it
 * (presumably the lock protecting the list -- confirm in the
 * implementation).
 */
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;

/*
 * Sanity magic values, one per slab data structure: magazine (SKM),
 * object (SKO), slab (SKS) and cache (SKC).  They are stored in the
 * *_magic field of the corresponding structure.
 */
#define SKM_MAGIC			0x2e2e2e2e
#define SKO_MAGIC			0x20202020
#define SKS_MAGIC			0x22222222
#define SKC_MAGIC			0x2c2c2c2c

/* Default cache tunables. */
#define SPL_KMEM_CACHE_DELAY		15	/* Minimum slab release age */
#define SPL_KMEM_CACHE_REAP		0	/* Default reap everything */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB	32	/* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	8	/* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */

/*
 * Callback types accepted by spl_kmem_cache_create(): object
 * constructor, object destructor and cache reclaim hook.
 * NOTE(review): the parameters are unnamed here; presumably
 * ctor(obj, private, flags), dtor(obj, private) and reclaim(private) --
 * confirm against the callers in the implementation.
 */
typedef int (*spl_kmem_ctor_t)(void *, void *, int);
typedef void (*spl_kmem_dtor_t)(void *, void *);
typedef void (*spl_kmem_reclaim_t)(void *);

/*
 * Magazine: a header followed by a variable-length array of object
 * pointers (skm_objs).  A cache holds one magazine pointer per CPU in
 * skc_mag[].  skm_objs must remain the last member since its storage
 * extends past the end of the structure.
 */
typedef struct spl_kmem_magazine {
	uint32_t		skm_magic;	/* Sanity magic */
	uint32_t		skm_avail;	/* Available objects */
	uint32_t		skm_size;	/* Magazine size */
	uint32_t		skm_refill;	/* Batch refill size */
	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
	struct delayed_work	skm_work;	/* Magazine reclaim work */
	unsigned long		skm_age;	/* Last cache access */
	void			*skm_objs[0];	/* Object pointers */
} spl_kmem_magazine_t;

/*
 * Per-object bookkeeping: records the buffer address, the slab that
 * owns the object, and the linkage used while the object sits on a free
 * object list.
 */
typedef struct spl_kmem_obj {
        uint32_t		sko_magic;	/* Sanity magic */
	void			*sko_addr;	/* Buffer address */
	struct spl_kmem_slab	*sko_slab;	/* Owned by slab */
	struct list_head	sko_list;	/* Free object list linkage */
} spl_kmem_obj_t;

/*
 * Per-slab bookkeeping: the owning cache, linkage on a slab list, the
 * list of free objects, the last-modified time in jiffies and a count of
 * objects currently in use.
 */
typedef struct spl_kmem_slab {
        uint32_t		sks_magic;	/* Sanity magic */
	uint32_t		sks_objs;	/* Objects per slab */
	struct spl_kmem_cache	*sks_cache;	/* Owned by cache */
	struct list_head	sks_list;	/* Slab list linkage */
	struct list_head	sks_free_list;	/* Free object list */
	unsigned long		sks_age;	/* Last modify jiffie */
	uint32_t		sks_ref;	/* Ref count used objects */
} spl_kmem_slab_t;

/*
 * Cache descriptor.  Groups, in order: identity (magic, name), the
 * per-CPU magazines and their sizing, the user callbacks and private
 * data, the object/slab geometry and reclaim tunables, reference count
 * and list linkage, the cache lock, and finally statistics counters.
 *
 * kmem_cache_t is aliased to this type so Solaris-style code using
 * kmem_cache_t resolves to the SPL implementation.
 */
typedef struct spl_kmem_cache {
	uint32_t		skc_magic;	/* Sanity magic */
	uint32_t		skc_name_size;	/* Name length */
	char			*skc_name;	/* Name string */
	spl_kmem_magazine_t	*skc_mag[NR_CPUS]; /* Per-CPU warm cache */
	uint32_t		skc_mag_size;	/* Magazine size */
	uint32_t		skc_mag_refill;	/* Magazine refill count */
	spl_kmem_ctor_t		skc_ctor;	/* Constructor */
	spl_kmem_dtor_t		skc_dtor;	/* Destructor */
	spl_kmem_reclaim_t	skc_reclaim;	/* Reclaimator */
	void			*skc_private;	/* Private data */
	void			*skc_vmp;	/* Unused */
	unsigned long		skc_flags;	/* Flags */
	uint32_t		skc_obj_size;	/* Object size */
	uint32_t		skc_obj_align;	/* Object alignment */
	uint32_t		skc_slab_objs;	/* Objects per slab */
	uint32_t		skc_slab_size;	/* Slab size */
	uint32_t		skc_delay;	/* Slab reclaim interval */
	uint32_t		skc_reap;	/* Slab reclaim count */
	atomic_t		skc_ref;	/* Ref count callers */
	struct delayed_work	skc_work;	/* Slab reclaim work */
	struct list_head	skc_list;	/* List of caches linkage */
	struct list_head	skc_complete_list;/* Completely alloc'ed */
	struct list_head	skc_partial_list; /* Partially alloc'ed */
	spinlock_t		skc_lock;	/* Cache lock */
	uint64_t		skc_slab_fail;	/* Slab alloc failures */
	uint64_t		skc_slab_create;/* Slab creates */
	uint64_t		skc_slab_destroy;/* Slab destroys */
	uint64_t		skc_slab_total;	/* Slab total current */
	uint64_t		skc_slab_alloc;	/* Slab alloc current */
	uint64_t		skc_slab_max;	/* Slab max historic  */
	uint64_t		skc_obj_total;	/* Obj total current */
	uint64_t		skc_obj_alloc;	/* Obj alloc current */
	uint64_t		skc_obj_max;	/* Obj max historic */
} spl_kmem_cache_t;
#define kmem_cache_t		spl_kmem_cache_t

/*
 * Create a cache of 'size'-byte objects.
 *
 * name    - cache name
 * size    - object size
 * align   - object alignment
 * ctor    - object constructor callback
 * dtor    - object destructor callback
 * reclaim - cache reclaim callback
 * priv    - private data (presumably handed back to the callbacks --
 *           confirm in the implementation)
 * vmp     - kept for interface compatibility; the matching skc_vmp field
 *           is marked unused
 * flags   - KMC_* cache flags
 */
extern spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
        spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
        void *priv, void *vmp, int flags);

/* Cache teardown, object allocate/free and reclaim entry points. */
extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc);
extern void spl_kmem_reap(void);

/* Subsystem setup and teardown hooks. */
int spl_kmem_init_kallsyms_lookup(void);
int spl_kmem_init(void);
void spl_kmem_fini(void);

/*
 * Solaris-named wrappers: each kmem_cache_*()/kmem_reap() call expands
 * to the spl_-prefixed function with the same arguments.
 */
#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
        spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
#define kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
#define kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc)	spl_kmem_cache_reap_now(skc)
#define kmem_reap()			spl_kmem_reap()
/*
 * True when ptr lies in [VMALLOC_START, VMALLOC_END).  The argument is
 * evaluated twice, so it must not have side effects.
 */
#define kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
					 ((ptr) <  (void *)VMALLOC_END))

#endif	/* _SPL_KMEM_H */