#include <sys/kmem.h>
#ifdef DEBUG_SUBSYSTEM
#undef DEBUG_SUBSYSTEM
#endif
#define DEBUG_SUBSYSTEM S_KMEM
/*
* Memory allocation interfaces
*/
#ifdef DEBUG_KMEM
/* Shim layer memory accounting */
atomic64_t kmem_alloc_used;
unsigned long kmem_alloc_max = 0;
atomic64_t vmem_alloc_used;
unsigned long vmem_alloc_max = 0;
int kmem_warning_flag = 1;
atomic64_t kmem_cache_alloc_failed;
spinlock_t kmem_lock;
struct hlist_head kmem_table[KMEM_TABLE_SIZE];
struct list_head kmem_list;
spinlock_t vmem_lock;
struct hlist_head vmem_table[VMEM_TABLE_SIZE];
struct list_head vmem_list;
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
EXPORT_SYMBOL(vmem_alloc_used);
EXPORT_SYMBOL(vmem_alloc_max);
EXPORT_SYMBOL(kmem_warning_flag);
EXPORT_SYMBOL(kmem_lock);
EXPORT_SYMBOL(kmem_table);
EXPORT_SYMBOL(kmem_list);
EXPORT_SYMBOL(vmem_lock);
EXPORT_SYMBOL(vmem_table);
EXPORT_SYMBOL(vmem_list);
int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
#else
int kmem_set_warning(int flag) { return 0; }
#endif
EXPORT_SYMBOL(kmem_set_warning);
/*
 * Slab allocation interfaces
 *
 * While the Linux slab implementation was inspired by Solaris, some
 * changes have been made to the API which complicate this shim layer.
 * For one thing the same symbol names are used with different
 * arguments in the prototypes.  To deal with this we must use the
 * preprocessor to re-order arguments.  Happily for us standard C says,
 * "Macros appearing in their own expansion are not re-expanded", so
 * this does not result in infinite recursion.  Additionally, the
 * function pointers registered by Solaris consumers differ from those
 * used by Linux, so a lookup and mapping from a Linux-style callback
 * to a Solaris-style callback is needed.  There is some overhead in
 * this operation which isn't horrible, but it needs to be kept in mind.
 */
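/* A minimal, purely illustrative sketch of the argument re-ordering
 * trick described above.  The real macro lives in the sys/kmem.h shim
 * header and may differ in detail:
 *
 *   #define kmem_cache_create(name, size, align, ctor, dtor,        \
 *                             reclaim, priv, vmp, flags)            \
 *           __kmem_cache_create((char *)(name), (size), (align),    \
 *                               (ctor), (dtor), (reclaim),          \
 *                               (priv), (vmp), (flags))
 *
 * Callers written against the Solaris API keep using the Solaris
 * symbol name; the macro simply routes them to the shim function
 * below, which in turn calls the native Linux kmem_cache_create().
 */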
#define KCC_MAGIC 0x7a7a7a7a
#define KCC_POISON 0x77
typedef struct kmem_cache_cb {
int kcc_magic;
struct list_head kcc_list;
kmem_cache_t * kcc_cache;
kmem_constructor_t kcc_constructor;
kmem_destructor_t kcc_destructor;
kmem_reclaim_t kcc_reclaim;
void * kcc_private;
void * kcc_vmp;
atomic_t kcc_ref;
} kmem_cache_cb_t;
static struct rw_semaphore kmem_cache_cb_sem;
static struct list_head kmem_cache_cb_list;
static struct shrinker *kmem_cache_shrinker;
/* This function must be called while holding kmem_cache_cb_sem.
 * Because kmem_cache_t is an opaque datatype we're forced to
 * match pointers to identify specific cache entries.
 */
static kmem_cache_cb_t *
kmem_cache_find_cache_cb(kmem_cache_t *cache)
{
kmem_cache_cb_t *kcc;
#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
ASSERT(rwsem_is_locked(&kmem_cache_cb_sem));
#endif
list_for_each_entry(kcc, &kmem_cache_cb_list, kcc_list)
if (cache == kcc->kcc_cache)
return kcc;
return NULL;
}
static kmem_cache_cb_t *
kmem_cache_add_cache_cb(kmem_cache_t *cache,
kmem_constructor_t constructor,
kmem_destructor_t destructor,
kmem_reclaim_t reclaim,
void *priv, void *vmp)
{
kmem_cache_cb_t *kcc;
kcc = (kmem_cache_cb_t *)kmalloc(sizeof(*kcc), GFP_KERNEL);
if (kcc) {
kcc->kcc_magic = KCC_MAGIC;
kcc->kcc_cache = cache;
kcc->kcc_constructor = constructor;
kcc->kcc_destructor = destructor;
kcc->kcc_reclaim = reclaim;
kcc->kcc_private = priv;
kcc->kcc_vmp = vmp;
atomic_set(&kcc->kcc_ref, 0);
down_write(&kmem_cache_cb_sem);
list_add(&kcc->kcc_list, &kmem_cache_cb_list);
up_write(&kmem_cache_cb_sem);
}
return kcc;
}
static void
kmem_cache_remove_cache_cb(kmem_cache_cb_t *kcc)
{
down_write(&kmem_cache_cb_sem);
ASSERT(atomic_read(&kcc->kcc_ref) == 0);
list_del(&kcc->kcc_list);
up_write(&kmem_cache_cb_sem);
	/* kcc was already validated and unlinked above; poison it to
	 * help catch any use-after-free. */
	memset(kcc, KCC_POISON, sizeof(*kcc));
	kfree(kcc);
}
static void
kmem_cache_generic_constructor(void *ptr, kmem_cache_t *cache, unsigned long flags)
{
kmem_cache_cb_t *kcc;
kmem_constructor_t constructor;
void *private;
ASSERT(flags & SLAB_CTOR_CONSTRUCTOR);
	/* Ensure constructor verify passes are not forwarded to the
	 * registered constructors.  This may not be safe because the
	 * Solaris constructors are unaware of how to handle the
	 * SLAB_CTOR_VERIFY flag.
	 */
if (flags & SLAB_CTOR_VERIFY)
return;
if (flags & SLAB_CTOR_ATOMIC)
flags = KM_NOSLEEP;
else
flags = KM_SLEEP;
/* We can be called with interrupts disabled so it is critical that
* this function and the registered constructor never sleep.
*/
while (!down_read_trylock(&kmem_cache_cb_sem));
/* Callback list must be in sync with linux slab caches */
kcc = kmem_cache_find_cache_cb(cache);
ASSERT(kcc);
ASSERT(kcc->kcc_magic == KCC_MAGIC);
atomic_inc(&kcc->kcc_ref);
constructor = kcc->kcc_constructor;
private = kcc->kcc_private;
up_read(&kmem_cache_cb_sem);
if (constructor)
constructor(ptr, private, (int)flags);
atomic_dec(&kcc->kcc_ref);
/* Linux constructor has no return code, silently eat it */
}
static void
kmem_cache_generic_destructor(void *ptr, kmem_cache_t *cache, unsigned long flags)
{
kmem_cache_cb_t *kcc;
kmem_destructor_t destructor;
void *private;
/* No valid destructor flags */
ASSERT(flags == 0);
	/* We can be called with interrupts disabled so it is critical
	 * that this function and the registered destructor never sleep.
	 */
while (!down_read_trylock(&kmem_cache_cb_sem));
/* Callback list must be in sync with linux slab caches */
kcc = kmem_cache_find_cache_cb(cache);
ASSERT(kcc);
ASSERT(kcc->kcc_magic == KCC_MAGIC);
atomic_inc(&kcc->kcc_ref);
destructor = kcc->kcc_destructor;
private = kcc->kcc_private;
up_read(&kmem_cache_cb_sem);
/* Solaris destructor takes no flags, silently eat them */
if (destructor)
destructor(ptr, private);
atomic_dec(&kcc->kcc_ref);
}
/* XXX - Arguments are ignored */
static int
kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
{
kmem_cache_cb_t *kcc;
int total = 0;
	/* Under Linux a shrinker is not tightly coupled with a slab
	 * cache.  In fact Linux systematically tries calling all of
	 * the registered shrinker callbacks until its target
	 * reclamation level is reached.  Because of this we only
	 * register one shrinker function in the shim layer for all
	 * slab caches, and we always attempt to shrink all caches
	 * when this generic shrinker is called.
	 */
down_read(&kmem_cache_cb_sem);
list_for_each_entry(kcc, &kmem_cache_cb_list, kcc_list) {
ASSERT(kcc);
ASSERT(kcc->kcc_magic == KCC_MAGIC);
		/* Take a reference on the cache in question.  If that
		 * cache is contended simply skip it; it may already be
		 * in the process of a reclaim, or the ctor/dtor may be
		 * running.  In either case it's best to skip it.
		 */
atomic_inc(&kcc->kcc_ref);
if (atomic_read(&kcc->kcc_ref) > 1) {
atomic_dec(&kcc->kcc_ref);
continue;
}
		/* Under Linux the desired number and gfp type of objects
		 * is passed to the reclaiming function as a suggested
		 * reclaim target.  These arguments are not passed along
		 * because under Solaris the reclaim policy is entirely
		 * up to the cache owner.  We only pass on the
		 * pre-registered private data.
		 */
if (kcc->kcc_reclaim)
kcc->kcc_reclaim(kcc->kcc_private);
atomic_dec(&kcc->kcc_ref);
total += 1;
}
	/* Under Linux we should return the remaining number of entries
	 * in the cache.  Unfortunately, there is no easy way to safely
	 * emulate this behavior, so we return one entry per cache which
	 * was registered with the generic shrinker.  This should fake
	 * out the Linux VM when it attempts to shrink caches.
	 */
up_read(&kmem_cache_cb_sem);
return total;
}
/* Ensure the kmem_cache_create/kmem_cache_destroy/kmem_cache_alloc
 * macros from the shim header are removed here to prevent recursive
 * substitution; below we want to call the native Linux versions.
 */
#undef kmem_cache_create
#undef kmem_cache_destroy
#undef kmem_cache_alloc
kmem_cache_t *
__kmem_cache_create(char *name, size_t size, size_t align,
kmem_constructor_t constructor,
kmem_destructor_t destructor,
kmem_reclaim_t reclaim,
void *priv, void *vmp, int flags)
{
kmem_cache_t *cache;
kmem_cache_cb_t *kcc;
int shrinker_flag = 0;
char *cache_name;
ENTRY;
	/* XXX: Options currently unsupported by the shim layer */
ASSERT(!vmp);
ASSERT(flags == 0);
cache_name = kzalloc(strlen(name) + 1, GFP_KERNEL);
if (cache_name == NULL)
RETURN(NULL);
strcpy(cache_name, name);
cache = kmem_cache_create(cache_name, size, align, flags,
kmem_cache_generic_constructor,
kmem_cache_generic_destructor);
	if (cache == NULL) {
		kfree(cache_name);
		RETURN(NULL);
	}
/* Register shared shrinker function on initial cache create */
down_read(&kmem_cache_cb_sem);
	if (list_empty(&kmem_cache_cb_list)) {
		kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
						   kmem_cache_generic_shrinker);
		if (kmem_cache_shrinker == NULL) {
			kmem_cache_destroy(cache);
			up_read(&kmem_cache_cb_sem);
			kfree(cache_name);
			RETURN(NULL);
		}
		shrinker_flag = 1;
	}
up_read(&kmem_cache_cb_sem);
kcc = kmem_cache_add_cache_cb(cache, constructor, destructor,
reclaim, priv, vmp);
	if (kcc == NULL) {
		if (shrinker_flag) /* Newly registered shrinker must be removed */
			remove_shrinker(kmem_cache_shrinker);
		kmem_cache_destroy(cache);
		kfree(cache_name);
		RETURN(NULL);
	}
RETURN(cache);
}
EXPORT_SYMBOL(__kmem_cache_create);
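/* For illustration only, a hedged example of how a Solaris-style
 * consumer would reach __kmem_cache_create() through the shim macro.
 * The object type, constructor, and cache name are hypothetical:
 *
 *   typedef struct my_obj { int mo_state; } my_obj_t;
 *
 *   static int
 *   my_obj_ctor(void *buf, void *priv, int kmflags)
 *   {
 *           ((my_obj_t *)buf)->mo_state = 0;
 *           return (0);
 *   }
 *
 *   cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
 *                             my_obj_ctor, NULL, NULL, NULL, NULL, 0);
 */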
/* A return code is provided despite Solaris's void return type.  There
 * should be no harm here since Solaris callers will ignore it anyway. */
int
__kmem_cache_destroy(kmem_cache_t *cache)
{
kmem_cache_cb_t *kcc;
char *name;
int rc;
ENTRY;
down_read(&kmem_cache_cb_sem);
kcc = kmem_cache_find_cache_cb(cache);
if (kcc == NULL) {
up_read(&kmem_cache_cb_sem);
RETURN(-EINVAL);
}
atomic_inc(&kcc->kcc_ref);
up_read(&kmem_cache_cb_sem);
name = (char *)kmem_cache_name(cache);
rc = kmem_cache_destroy(cache);
atomic_dec(&kcc->kcc_ref);
kmem_cache_remove_cache_cb(kcc);
kfree(name);
/* Unregister generic shrinker on removal of all caches */
down_read(&kmem_cache_cb_sem);
if (list_empty(&kmem_cache_cb_list))
remove_shrinker(kmem_cache_shrinker);
up_read(&kmem_cache_cb_sem);
RETURN(rc);
}
EXPORT_SYMBOL(__kmem_cache_destroy);
/* Under Solaris if the KM_SLEEP flag is passed we absolutely must
 * sleep until the memory is allocated.  Under Linux a memory
 * allocation can still fail, so we are forced to keep retrying the
 * allocation even if the system is under substantial memory pressure
 * or fragmentation prevents the allocation from succeeding.  This is
 * not the correct fix, or even a good one, but it will do for now.
 */
void *
__kmem_cache_alloc(kmem_cache_t *cache, gfp_t flags)
{
void *rc;
ENTRY;
restart:
rc = kmem_cache_alloc(cache, flags);
if ((rc == NULL) && (flags & KM_SLEEP)) {
#ifdef DEBUG_KMEM
atomic64_inc(&kmem_cache_alloc_failed);
#endif /* DEBUG_KMEM */
GOTO(restart, rc);
}
RETURN(rc);
}
EXPORT_SYMBOL(__kmem_cache_alloc);
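/* Illustrative usage, continuing the hypothetical cache sketched
 * above: with KM_SLEEP the shim retries internally, so the caller
 * never sees a NULL return.
 *
 *   my_obj_t *obj = kmem_cache_alloc(cache, KM_SLEEP);
 *   ...
 *   kmem_cache_free(cache, obj);
 */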
void
__kmem_reap(void)
{
ENTRY;
	/* Since there's no easy hook into Linux to force all of the
	 * registered shrinkers to run, we just run the one registered
	 * for this shim layer. */
kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
EXIT;
}
EXPORT_SYMBOL(__kmem_reap);
int
kmem_init(void)
{
ENTRY;
init_rwsem(&kmem_cache_cb_sem);
INIT_LIST_HEAD(&kmem_cache_cb_list);
#ifdef DEBUG_KMEM
{
int i;
atomic64_set(&kmem_alloc_used, 0);
atomic64_set(&vmem_alloc_used, 0);
spin_lock_init(&kmem_lock);
INIT_LIST_HEAD(&kmem_list);
for (i = 0; i < KMEM_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kmem_table[i]);
spin_lock_init(&vmem_lock);
INIT_LIST_HEAD(&vmem_list);
for (i = 0; i < VMEM_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&vmem_table[i]);
atomic64_set(&kmem_cache_alloc_failed, 0);
}
#endif
RETURN(0);
}
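/* A hedged sketch of the intended call order; the real module hooks
 * live elsewhere in the shim, and the function names below are
 * hypothetical:
 *
 *   static int __init spl_module_init(void)  { return kmem_init(); }
 *   static void __exit spl_module_exit(void) { kmem_fini(); }
 */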
#ifdef DEBUG_KMEM
static char *
sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
int i, flag = 1;
ASSERT(str != NULL && len >= 17);
memset(str, 0, len);
/* Check for a fully printable string, and while we are at
* it place the printable characters in the passed buffer. */
for (i = 0; i < size; i++) {
str[i] = ((char *)(kd->kd_addr))[i];
if (isprint(str[i])) {
continue;
} else {
/* Minimum number of printable characters found
* to make it worthwhile to print this as ascii. */
if (i > min)
break;
flag = 0;
break;
}
}
	if (!flag) {
		/* Not printable; dump the first 8 bytes as hex instead */
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
			*((uint8_t *)kd->kd_addr + 0),
			*((uint8_t *)kd->kd_addr + 1),
			*((uint8_t *)kd->kd_addr + 2),
			*((uint8_t *)kd->kd_addr + 3),
			*((uint8_t *)kd->kd_addr + 4),
			*((uint8_t *)kd->kd_addr + 5),
			*((uint8_t *)kd->kd_addr + 6),
			*((uint8_t *)kd->kd_addr + 7));
	}
return str;
}
#endif /* DEBUG_KMEM */
void
kmem_fini(void)
{
ENTRY;
#ifdef DEBUG_KMEM
{
unsigned long flags;
kmem_debug_t *kd;
char str[17];
/* Display all unreclaimed memory addresses, including the
* allocation size and the first few bytes of what's located
* at that address to aid in debugging. Performance is not
* a serious concern here since it is module unload time. */
		if (atomic64_read(&kmem_alloc_used) != 0)
			CWARN("kmem leaked %ld/%ld bytes\n",
			      atomic64_read(&kmem_alloc_used), kmem_alloc_max);
spin_lock_irqsave(&kmem_lock, flags);
if (!list_empty(&kmem_list))
CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n",
"address", "size", "data", "func", "line");
list_for_each_entry(kd, &kmem_list, kd_list)
CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n",
kd->kd_addr, kd->kd_size,
sprintf_addr(kd, str, 17, 8),
kd->kd_func, kd->kd_line);
spin_unlock_irqrestore(&kmem_lock, flags);
		if (atomic64_read(&vmem_alloc_used) != 0)
			CWARN("vmem leaked %ld/%ld bytes\n",
			      atomic64_read(&vmem_alloc_used), vmem_alloc_max);
spin_lock_irqsave(&vmem_lock, flags);
if (!list_empty(&vmem_list))
CDEBUG(D_WARNING, "%-16s %-5s %-16s %s:%s\n",
"address", "size", "data", "func", "line");
list_for_each_entry(kd, &vmem_list, kd_list)
CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n",
kd->kd_addr, kd->kd_size,
sprintf_addr(kd, str, 17, 8),
kd->kd_func, kd->kd_line);
spin_unlock_irqrestore(&vmem_lock, flags);
}
#endif
EXIT;
}